# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
Report Quality Evaluation Module for DeerFlow.

Exposes objective tools for assessing generated report quality: fast
automated metrics plus an optional LLM-based (LLM-as-Judge) evaluation.
"""

from .evaluator import ReportEvaluator
from .llm_judge import LLMJudge, evaluate_with_llm
from .metrics import ReportMetrics, compute_metrics

# Public API of the package.
__all__ = [
    "ReportEvaluator",
    "ReportMetrics",
    "compute_metrics",
    "LLMJudge",
    "evaluate_with_llm",
]
"""
Combined report evaluator orchestrating both automated metrics and LLM evaluation.
"""

import logging
from dataclasses import dataclass
from typing import Any, Dict, Optional

from .llm_judge import EvaluationResult, LLMJudge
from .metrics import ReportMetrics, compute_metrics, get_word_count_target

logger = logging.getLogger(__name__)


@dataclass
class CombinedEvaluation:
    """Combined evaluation results from metrics and LLM judge."""

    metrics: ReportMetrics
    llm_evaluation: Optional[EvaluationResult]
    final_score: float
    grade: str
    summary: str

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary format (nested results serialized too)."""
        llm_part = self.llm_evaluation.to_dict() if self.llm_evaluation else None
        return {
            "metrics": self.metrics.to_dict(),
            "llm_evaluation": llm_part,
            "final_score": self.final_score,
            "grade": self.grade,
            "summary": self.summary,
        }


# (threshold, grade) pairs, best first; the first threshold the score
# reaches determines the grade. Anything below 4.0 is an "F".
_GRADE_BANDS = (
    (9.0, "A+"),
    (8.5, "A"),
    (8.0, "A-"),
    (7.5, "B+"),
    (7.0, "B"),
    (6.5, "B-"),
    (6.0, "C+"),
    (5.5, "C"),
    (5.0, "C-"),
    (4.0, "D"),
)


def score_to_grade(score: float) -> str:
    """Convert numeric score to letter grade."""
    for threshold, grade in _GRADE_BANDS:
        if score >= threshold:
            return grade
    return "F"


class ReportEvaluator:
    """
    Combined report evaluator using both automated metrics and LLM-as-Judge.

    Assessment happens in up to three steps:
    1. Computing automated metrics (fast, deterministic)
    2. Running LLM-based evaluation (nuanced, contextual)
    3. Combining both into a final score and letter grade
    """

    def __init__(self, llm: Any = None, use_llm: bool = True):
        """
        Initialize the evaluator.

        Args:
            llm: Optional LLM instance for LLM-as-Judge evaluation
            use_llm: Whether to use LLM evaluation (can be disabled for speed)
        """
        self.use_llm = use_llm
        self.llm_judge = LLMJudge(llm=llm) if use_llm else None

    def _compute_metrics_score(
        self, metrics: ReportMetrics, report_style: str
    ) -> float:
        """
        Convert automated metrics to a 0-10 score.

        Weighting: section coverage 30%, citation quality 25%, word-count
        compliance 20%, source diversity 15%, image inclusion 10%.
        """
        total = 0.0

        # Section coverage (30%): coverage ratio scaled to 0-10.
        total += (metrics.section_coverage_score * 10) * 0.30

        # Citations (25%): saturates at 10 citations.
        total += (min(metrics.citation_count / 10, 1.0) * 10) * 0.25

        # Word count (20%): full marks inside the style's target band,
        # linear penalty when short, softer capped penalty when long.
        target = get_word_count_target(report_style)
        if target:
            if target["min"] <= metrics.word_count <= target["max"]:
                word_score = 10.0
            elif metrics.word_count < target["min"]:
                word_score = (metrics.word_count / target["min"]) * 8
            else:
                excess_ratio = metrics.word_count / target["max"]
                word_score = max(10 - (excess_ratio - 1) * 5, 5)
            total += word_score * 0.20

        # Source diversity (15%): saturates at 5 unique domains.
        total += (min(metrics.unique_sources / 5, 1.0) * 10) * 0.15

        # Images (10%): saturates at 3 images.
        total += (min(metrics.image_count / 3, 1.0) * 10) * 0.10

        return round(total, 2)

    def _generate_summary(
        self,
        metrics: ReportMetrics,
        llm_eval: Optional[EvaluationResult],
        final_score: float,
        grade: str,
    ) -> str:
        """Generate a human-readable evaluation summary (markdown)."""
        lines = [f"Report Grade: {grade} ({final_score}/10)", ""]

        lines += [
            "**Automated Metrics:**",
            f"- Word Count: {metrics.word_count}",
            f"- Citations: {metrics.citation_count}",
            f"- Unique Sources: {metrics.unique_sources}",
            f"- Images: {metrics.image_count}",
            f"- Section Coverage: {metrics.section_coverage_score * 100:.0f}%",
        ]
        if metrics.sections_missing:
            lines.append(f"- Missing Sections: {', '.join(metrics.sections_missing)}")

        if llm_eval:
            lines += ["", "**LLM Evaluation:**"]
            for criterion, score in llm_eval.scores.items():
                lines.append(f"- {criterion.replace('_', ' ').title()}: {score}/10")

            # Only surface the top three strengths/weaknesses to keep the
            # summary compact.
            if llm_eval.strengths:
                lines += ["", "**Strengths:**"]
                lines += [f"- {strength}" for strength in llm_eval.strengths[:3]]
            if llm_eval.weaknesses:
                lines += ["", "**Areas for Improvement:**"]
                lines += [f"- {weakness}" for weakness in llm_eval.weaknesses[:3]]

        return "\n".join(lines)

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """
        Evaluate a report using both metrics and LLM.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report

        Returns:
            CombinedEvaluation with full results
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)

        llm_eval = None
        if self.use_llm and self.llm_judge:
            try:
                llm_eval = await self.llm_judge.evaluate(report, query, report_style)
            except Exception as e:
                logger.warning(f"LLM evaluation failed, using metrics only: {e}")

        # Blend 40% metrics / 60% LLM when a usable LLM result exists;
        # otherwise fall back to the metrics score alone.
        if llm_eval and llm_eval.overall_score > 0:
            final_score = (metrics_score * 0.4) + (llm_eval.weighted_score * 0.6)
        else:
            final_score = metrics_score
        final_score = round(final_score, 2)

        grade = score_to_grade(final_score)
        summary = self._generate_summary(metrics, llm_eval, final_score, grade)

        return CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=llm_eval,
            final_score=final_score,
            grade=grade,
            summary=summary,
        )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """Synchronous version of evaluate.

        NOTE(review): uses asyncio.run, so it cannot be called from inside a
        running event loop — use `evaluate` there instead.
        """
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))

    def evaluate_metrics_only(
        self,
        report: str,
        report_style: str = "default",
    ) -> Dict[str, Any]:
        """
        Quick evaluation using only automated metrics (no LLM).

        Args:
            report: The report text to evaluate
            report_style: The style of report

        Returns:
            Dictionary with metrics, score and grade
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)
        return {
            "metrics": metrics.to_dict(),
            "score": metrics_score,
            "grade": score_to_grade(metrics_score),
        }


# --- src/eval/llm_judge.py (module header) ---
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
LLM-as-Judge evaluation for report quality.

Uses an LLM to evaluate reports on multiple quality dimensions,
providing more nuanced assessment than automated metrics alone.
"""

import json
import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from langchain_core.messages import HumanMessage, SystemMessage

logger = logging.getLogger(__name__)

# Maximum characters of report content to send to the LLM for evaluation.
# This limit prevents exceeding LLM context windows and controls token usage.
MAX_REPORT_LENGTH = 15000
# Evaluation rubric: per-criterion description and relative weight.
# Weights sum to 1.0; LLMJudge renormalizes if only a subset is scored.
EVALUATION_CRITERIA = {
    "factual_accuracy": {
        "description": "Are claims supported by cited sources? Is information accurate and verifiable?",
        "weight": 0.25,
    },
    "completeness": {
        "description": "Does the report comprehensively cover all aspects of the topic?",
        "weight": 0.20,
    },
    "coherence": {
        "description": "Is the report logically structured, well-organized, and easy to follow?",
        "weight": 0.20,
    },
    "relevance": {
        "description": "Does the content directly address the research question without unnecessary tangents?",
        "weight": 0.15,
    },
    "citation_quality": {
        "description": "Are sources credible, diverse, and properly cited?",
        "weight": 0.10,
    },
    "writing_quality": {
        "description": "Is the writing clear, professional, and appropriate for the target audience?",
        "weight": 0.10,
    },
}

# System prompt sent verbatim to the judge model; it pins the JSON schema
# the parser in LLMJudge._parse_response expects.
JUDGE_SYSTEM_PROMPT = """You are an expert report quality evaluator. Your task is to objectively assess the quality of research reports.

Evaluate the report on the following criteria, scoring each from 1-10:

1. **Factual Accuracy** (1-10): Are claims supported by cited sources? Is information accurate?
2. **Completeness** (1-10): Does the report cover all aspects of the topic comprehensively?
3. **Coherence** (1-10): Is the report logically structured and easy to follow?
4. **Relevance** (1-10): Does content directly address the research question?
5. **Citation Quality** (1-10): Are sources credible, diverse, and properly cited?
6. **Writing Quality** (1-10): Is the writing clear and appropriate for the audience?

Respond ONLY with a valid JSON object in this exact format:
{
    "scores": {
        "factual_accuracy": <1-10>,
        "completeness": <1-10>,
        "coherence": <1-10>,
        "relevance": <1-10>,
        "citation_quality": <1-10>,
        "writing_quality": <1-10>
    },
    "overall_score": <1-10>,
    "strengths": ["strength1", "strength2"],
    "weaknesses": ["weakness1", "weakness2"],
    "suggestions": ["suggestion1", "suggestion2"]
}

Be objective and thorough in your evaluation."""


@dataclass
class EvaluationResult:
    """Container for LLM evaluation results."""

    # Per-criterion 1-10 scores keyed by criterion name.
    scores: Dict[str, int]
    # Overall score as reported directly by the judge model.
    overall_score: float
    # Score recomputed from `scores` using EVALUATION_CRITERIA weights.
    weighted_score: float
    strengths: List[str]
    weaknesses: List[str]
    suggestions: List[str]
    # Raw model output, kept for debugging; excluded from to_dict().
    raw_response: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert evaluation result to dictionary (raw_response omitted)."""
        return {
            "scores": self.scores,
            "overall_score": self.overall_score,
            "weighted_score": self.weighted_score,
            "strengths": self.strengths,
            "weaknesses": self.weaknesses,
            "suggestions": self.suggestions,
        }
class LLMJudge:
    """LLM-based report quality evaluator."""

    def __init__(self, llm: Any = None):
        """
        Initialize the LLM Judge.

        Args:
            llm: LangChain-compatible LLM instance. If None, will be created on demand.
        """
        self._llm = llm

    def _get_llm(self):
        """Lazily resolve the LLM, defaulting to the project's "basic" model."""
        if self._llm is None:
            from src.llms.llm import get_llm_by_type

            self._llm = get_llm_by_type("basic")
        return self._llm

    def _calculate_weighted_score(self, scores: Dict[str, int]) -> float:
        """Weighted average over known criteria; unknown keys are ignored.

        Weights are renormalized when only a subset of criteria is present.
        """
        pairs = [
            (value, EVALUATION_CRITERIA[name]["weight"])
            for name, value in scores.items()
            if name in EVALUATION_CRITERIA
        ]
        weight_total = sum(w for _, w in pairs)
        if weight_total <= 0:
            return 0.0
        return round(sum(v * w for v, w in pairs) / weight_total, 2)

    def _parse_response(self, response: str) -> Dict[str, Any]:
        """Parse LLM response into structured format.

        Strips an optional markdown code fence before JSON-decoding. On any
        parse failure, a neutral all-5s fallback evaluation is returned so
        callers always get a well-formed dict.
        """
        try:
            candidate = response
            if "```json" in response:
                candidate = response.split("```json")[1].split("```")[0]
            elif "```" in response:
                candidate = response.split("```")[1].split("```")[0]
            return json.loads(candidate.strip())
        except (json.JSONDecodeError, IndexError) as e:
            logger.warning(f"Failed to parse LLM response: {e}")
            return {
                "scores": {
                    "factual_accuracy": 5,
                    "completeness": 5,
                    "coherence": 5,
                    "relevance": 5,
                    "citation_quality": 5,
                    "writing_quality": 5,
                },
                "overall_score": 5,
                "strengths": ["Unable to parse evaluation"],
                "weaknesses": ["Evaluation parsing failed"],
                "suggestions": ["Please re-run evaluation"],
            }

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> EvaluationResult:
        """
        Evaluate a report using LLM-as-Judge.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report for context

        Returns:
            EvaluationResult with scores and feedback (all-zero scores on failure)
        """
        llm = self._get_llm()

        # Report is truncated to MAX_REPORT_LENGTH to stay within context.
        user_prompt = f"""Please evaluate the following research report.

**Original Research Query:** {query}

**Report Style:** {report_style}

**Report to Evaluate:**
{report[:MAX_REPORT_LENGTH]}

Provide your evaluation in the specified JSON format."""

        messages = [
            SystemMessage(content=JUDGE_SYSTEM_PROMPT),
            HumanMessage(content=user_prompt),
        ]

        try:
            response = await llm.ainvoke(messages)
            response_text = (
                response.content if hasattr(response, "content") else str(response)
            )

            parsed = self._parse_response(response_text)
            criterion_scores = parsed.get("scores", {})

            return EvaluationResult(
                scores=criterion_scores,
                overall_score=parsed.get("overall_score", 5),
                weighted_score=self._calculate_weighted_score(criterion_scores),
                strengths=parsed.get("strengths", []),
                weaknesses=parsed.get("weaknesses", []),
                suggestions=parsed.get("suggestions", []),
                raw_response=response_text,
            )

        except Exception as e:
            logger.error(f"LLM evaluation failed: {e}")
            zeroed = {
                "factual_accuracy": 0,
                "completeness": 0,
                "coherence": 0,
                "relevance": 0,
                "citation_quality": 0,
                "writing_quality": 0,
            }
            return EvaluationResult(
                scores=zeroed,
                overall_score=0,
                weighted_score=0,
                strengths=[],
                weaknesses=[f"Evaluation failed: {str(e)}"],
                suggestions=["Please retry evaluation"],
            )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> EvaluationResult:
        """
        Synchronous version of evaluate.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report for context

        Returns:
            EvaluationResult with scores and feedback
        """
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))


async def evaluate_with_llm(
    report: str,
    query: str,
    report_style: str = "default",
    llm: Any = None,
) -> EvaluationResult:
    """
    Convenience function to evaluate a report with LLM.

    Args:
        report: The report text to evaluate
        query: The original research query
        report_style: The style of report for context
        llm: Optional LLM instance to use

    Returns:
        EvaluationResult with scores and feedback
    """
    judge = LLMJudge(llm=llm)
    return await judge.evaluate(report, query, report_style)
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
Automated metrics for report quality evaluation.

These metrics can be computed without LLM calls, providing fast and
deterministic quality assessment.
"""

import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from urllib.parse import urlparse


@dataclass
class ReportMetrics:
    """Container for computed report metrics."""

    # Raw counts extracted from the markdown report.
    word_count: int = 0
    citation_count: int = 0
    unique_sources: int = 0
    image_count: int = 0
    # Section/structure information.
    section_count: int = 0
    sections_found: List[str] = field(default_factory=list)
    sections_missing: List[str] = field(default_factory=list)
    section_coverage_score: float = 0.0
    # Presence flags for the most important structural sections.
    has_title: bool = False
    has_key_points: bool = False
    has_overview: bool = False
    has_citations_section: bool = False

    def to_dict(self) -> Dict:
        """Convert metrics to dictionary."""
        return {
            "word_count": self.word_count,
            "citation_count": self.citation_count,
            "unique_sources": self.unique_sources,
            "image_count": self.image_count,
            "section_count": self.section_count,
            "sections_found": self.sections_found,
            "sections_missing": self.sections_missing,
            "section_coverage_score": self.section_coverage_score,
            "has_title": self.has_title,
            "has_key_points": self.has_key_points,
            "has_overview": self.has_overview,
            "has_citations_section": self.has_citations_section,
        }
# Required sections for different report styles
REPORT_STYLE_SECTIONS = {
    "default": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "academic": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "literature_review",
        "methodology",
        "key_citations",
    ],
    "news": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "popular_science": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "social_media": [
        "title",
        "key_points",
        "overview",
        "key_citations",
    ],
    "strategic_investment": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "executive_summary",
        "market_analysis",
        "technology_analysis",
        "investment_recommendations",
        "key_citations",
    ],
}

# Section name patterns for detection (supports both English and Chinese)
SECTION_PATTERNS = {
    "title": r"^#\s+.+",
    "key_points": r"(?:key\s*points|要点|关键发现|核心观点)",
    "overview": r"(?:overview|概述|简介|背景)",
    "detailed_analysis": r"(?:detailed\s*analysis|详细分析|深度分析|分析)",
    "key_citations": r"(?:key\s*citations|references|参考文献|引用|来源)",
    "literature_review": r"(?:literature\s*review|文献综述|研究回顾)",
    "methodology": r"(?:methodology|方法论|研究方法)",
    "executive_summary": r"(?:executive\s*summary|执行摘要|投资建议)",
    "market_analysis": r"(?:market\s*analysis|市场分析|产业分析)",
    "technology_analysis": r"(?:technology|技术.*(?:分析|解析|深度))",
    "investment_recommendations": r"(?:investment.*recommend|投资建议|投资评级)",
}


def count_words(text: str) -> int:
    """Count words in text, handling both English and Chinese.

    English is counted by word tokens; Chinese by individual CJK characters.
    """
    english_words = len(re.findall(r"\b[a-zA-Z]+\b", text))
    chinese_chars = len(re.findall(r"[\u4e00-\u9fff]", text))
    return english_words + chinese_chars


def count_citations(text: str) -> int:
    """Count markdown-style citations [text](url), excluding image embeds.

    FIX: the previous pattern also matched the ``[alt](url)`` inside image
    syntax ``![alt](url)``, so every embedded image was double-counted as a
    citation (images are scored separately by the evaluator). The negative
    lookbehind ``(?<!!)`` skips links preceded by ``!``.
    """
    pattern = r"(?<!!)\[.+?\]\(https?://[^\s\)]+\)"
    return len(re.findall(pattern, text))


def extract_domains(text: str) -> List[str]:
    """Extract unique domains from URLs in the text.

    Returns:
        Unordered list of lowercase domains with a leading "www." removed.
    """
    url_pattern = r"https?://([^\s\)\]]+)"
    urls = re.findall(url_pattern, text)
    domains = set()
    for url in urls:
        try:
            parsed = urlparse(f"http://{url}")
            domain = parsed.netloc or url.split("/")[0]
            # FIX: strip only a LEADING "www." — str.replace removed the
            # substring anywhere in the domain (e.g. "mywww.example.com"
            # would have become "myexample.com").
            domain = domain.lower().removeprefix("www.")
            if domain:
                domains.add(domain)
        except Exception:
            # Malformed URL fragments are simply skipped.
            continue
    return list(domains)


def count_images(text: str) -> int:
    """Count markdown images ![alt](url)."""
    pattern = r"!\[.*?\]\(.+?\)"
    return len(re.findall(pattern, text))


def detect_sections(text: str, report_style: str = "default") -> Dict[str, bool]:
    """Detect which required sections are present in the report.

    Sections required for the style come from REPORT_STYLE_SECTIONS; each is
    matched against SECTION_PATTERNS (falling back to the section name with
    underscores treated as optional whitespace).
    """
    required_sections = REPORT_STYLE_SECTIONS.get(
        report_style, REPORT_STYLE_SECTIONS["default"]
    )
    detected = {}

    text_lower = text.lower()

    for section in required_sections:
        pattern = SECTION_PATTERNS.get(section, section.replace("_", r"\s*"))
        if section == "title":
            # Title must be a markdown H1 at the start of a line; match the
            # original (non-lowercased) text so the pattern stays anchored.
            detected[section] = bool(re.search(pattern, text, re.MULTILINE))
        else:
            detected[section] = bool(
                re.search(pattern, text_lower, re.IGNORECASE | re.MULTILINE)
            )

    return detected


def compute_metrics(
    report: str, report_style: str = "default", target_word_count: Optional[int] = None
) -> ReportMetrics:
    """
    Compute automated metrics for a report.

    Args:
        report: The report text in markdown format
        report_style: The style of report (academic, news, etc.)
        target_word_count: Reserved for future use — currently unused; word
            count targets come from get_word_count_target(report_style).

    Returns:
        ReportMetrics object with computed values
    """
    metrics = ReportMetrics()

    metrics.word_count = count_words(report)
    metrics.citation_count = count_citations(report)

    domains = extract_domains(report)
    metrics.unique_sources = len(domains)

    metrics.image_count = count_images(report)

    sections_detected = detect_sections(report, report_style)
    metrics.sections_found = [s for s, found in sections_detected.items() if found]
    metrics.sections_missing = [
        s for s, found in sections_detected.items() if not found
    ]
    metrics.section_count = len(metrics.sections_found)

    total_sections = len(sections_detected)
    if total_sections > 0:
        metrics.section_coverage_score = len(metrics.sections_found) / total_sections

    metrics.has_title = sections_detected.get("title", False)
    metrics.has_key_points = sections_detected.get("key_points", False)
    metrics.has_overview = sections_detected.get("overview", False)
    metrics.has_citations_section = sections_detected.get("key_citations", False)

    return metrics


def get_word_count_target(report_style: str) -> Dict[str, int]:
    """Get target word count range ({"min": ..., "max": ...}) for a style.

    Unknown styles fall back to the "default" band.
    """
    targets = {
        "strategic_investment": {"min": 10000, "max": 15000},
        "academic": {"min": 3000, "max": 8000},
        "news": {"min": 800, "max": 2000},
        "popular_science": {"min": 1500, "max": 4000},
        "social_media": {"min": 500, "max": 1500},
        "default": {"min": 1000, "max": 5000},
    }
    return targets.get(report_style, targets["default"])
# --- src/server/app.py (additions) ---
from src.eval import ReportEvaluator
from src.server.eval_request import EvaluateReportRequest, EvaluateReportResponse


@app.post("/api/report/evaluate", response_model=EvaluateReportResponse)
async def evaluate_report(request: EvaluateReportRequest):
    """Evaluate report quality using automated metrics and optionally LLM-as-Judge."""
    try:
        evaluator = ReportEvaluator(use_llm=request.use_llm)
        style = request.report_style or "default"

        # Fast path: deterministic metrics only, no LLM round-trip.
        if not request.use_llm:
            quick = evaluator.evaluate_metrics_only(request.content, style)
            return EvaluateReportResponse(
                metrics=quick["metrics"],
                score=quick["score"],
                grade=quick["grade"],
            )

        combined = await evaluator.evaluate(request.content, request.query, style)
        llm_payload = (
            combined.llm_evaluation.to_dict() if combined.llm_evaluation else None
        )
        return EvaluateReportResponse(
            metrics=combined.metrics.to_dict(),
            score=combined.final_score,
            grade=combined.grade,
            llm_evaluation=llm_payload,
            summary=combined.summary,
        )
    except Exception as e:
        logger.exception(f"Error occurred during report evaluation: {str(e)}")
        raise HTTPException(status_code=500, detail=INTERNAL_SERVER_ERROR_DETAIL)


# --- src/server/eval_request.py ---
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""Request models for report evaluation endpoint."""

from typing import Optional

from pydantic import BaseModel, Field


class EvaluateReportRequest(BaseModel):
    """Request model for report evaluation."""

    content: str = Field(description="Report markdown content to evaluate")
    query: str = Field(description="Original research query")
    report_style: Optional[str] = Field(
        default="default", description="Report style (academic, news, etc.)"
    )
    # LLM evaluation is opt-in because it is slow and costs tokens.
    use_llm: bool = Field(
        default=False,
        description="Whether to use LLM for deep evaluation (slower but more detailed)",
    )


class EvaluationMetrics(BaseModel):
    """Automated metrics result (mirrors ReportMetrics.to_dict())."""

    word_count: int
    citation_count: int
    unique_sources: int
    image_count: int
    section_count: int
    section_coverage_score: float
    sections_found: list[str]
    sections_missing: list[str]
    has_title: bool
    has_key_points: bool
    has_overview: bool
    has_citations_section: bool


class LLMEvaluationScores(BaseModel):
    """Per-criterion LLM evaluation scores (1-10; 0 means unavailable)."""

    factual_accuracy: int = 0
    completeness: int = 0
    coherence: int = 0
    relevance: int = 0
    citation_quality: int = 0
    writing_quality: int = 0


class LLMEvaluation(BaseModel):
    """LLM evaluation result (mirrors EvaluationResult.to_dict())."""

    scores: LLMEvaluationScores
    overall_score: float
    weighted_score: float
    strengths: list[str]
    weaknesses: list[str]
    suggestions: list[str]


class EvaluateReportResponse(BaseModel):
    """Response model for report evaluation.

    llm_evaluation and summary are present only when use_llm was requested.
    """

    metrics: EvaluationMetrics
    score: float
    grade: str
    llm_evaluation: Optional[LLMEvaluation] = None
    summary: Optional[str] = None
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""Unit tests for the combined report evaluator."""

import json
from unittest.mock import AsyncMock, MagicMock

import pytest

from src.eval.evaluator import CombinedEvaluation, ReportEvaluator, score_to_grade
from src.eval.llm_judge import (
    EVALUATION_CRITERIA,
    MAX_REPORT_LENGTH,
    EvaluationResult,
    LLMJudge,
)
from src.eval.metrics import ReportMetrics


class TestScoreToGrade:
    """score_to_grade maps numeric scores onto letter grades."""

    def test_excellent_scores(self):
        for value, expected in [
            (9.5, "A+"), (9.0, "A+"), (8.7, "A"), (8.5, "A"), (8.2, "A-"),
        ]:
            assert score_to_grade(value) == expected

    def test_good_scores(self):
        for value, expected in [
            (7.8, "B+"), (7.5, "B+"), (7.2, "B"), (7.0, "B"), (6.7, "B-"),
        ]:
            assert score_to_grade(value) == expected

    def test_average_scores(self):
        for value, expected in [
            (6.2, "C+"), (5.8, "C"), (5.5, "C"), (5.2, "C-"),
        ]:
            assert score_to_grade(value) == expected

    def test_poor_scores(self):
        for value, expected in [
            (4.5, "D"), (4.0, "D"), (3.0, "F"), (1.0, "F"),
        ]:
            assert score_to_grade(value) == expected


class TestReportEvaluator:
    """Metrics-only behaviour of ReportEvaluator."""

    @pytest.fixture
    def evaluator(self):
        """Create evaluator without LLM for metrics-only tests."""
        return ReportEvaluator(use_llm=False)

    @pytest.fixture
    def sample_report(self):
        """A well-formed report exercising every metric dimension."""
        return """
# Comprehensive Research Report

## Key Points
- Important finding number one with significant implications
- Critical discovery that changes our understanding
- Key insight that provides actionable recommendations
- Notable observation from the research data

## Overview
This report presents a comprehensive analysis of the research topic.
The findings are based on extensive data collection and analysis.

## Detailed Analysis

### Section 1: Background
The background of this research involves multiple factors.
[Source 1](https://example.com/source1) provides foundational context.

### Section 2: Methodology
Our methodology follows established research practices.
[Source 2](https://research.org/methods) outlines the approach.

### Section 3: Findings
The key findings include several important discoveries.
![Research Data](https://example.com/chart.png)

[Source 3](https://academic.edu/paper) supports these conclusions.

## Key Citations
- [Example Source](https://example.com/source1)
- [Research Methods](https://research.org/methods)
- [Academic Paper](https://academic.edu/paper)
- [Additional Reference](https://reference.com/doc)
    """

    def test_evaluate_metrics_only(self, evaluator, sample_report):
        """A good report yields a positive score and a valid grade."""
        outcome = evaluator.evaluate_metrics_only(sample_report)

        assert "metrics" in outcome
        assert "score" in outcome
        assert "grade" in outcome
        assert outcome["score"] > 0
        assert outcome["grade"] in ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D", "F"]

    def test_evaluate_metrics_only_structure(self, evaluator, sample_report):
        """The metrics dict exposes the expected fields."""
        outcome = evaluator.evaluate_metrics_only(sample_report)
        metrics = outcome["metrics"]

        for key in (
            "word_count",
            "citation_count",
            "unique_sources",
            "image_count",
            "section_coverage_score",
        ):
            assert key in metrics

    def test_evaluate_minimal_report(self, evaluator):
        """A bare snippet of text scores poorly."""
        outcome = evaluator.evaluate_metrics_only("Just some text.")

        assert outcome["score"] < 5.0
        assert outcome["grade"] in ["D", "F"]

    def test_metrics_score_calculation(self, evaluator):
        """A structurally complete report clears the mid-score bar."""
        good_report = """
# Title

## Key Points
- Point 1
- Point 2

## Overview
Overview content here.

## Detailed Analysis
Analysis with [cite](https://a.com) and [cite2](https://b.com)
and [cite3](https://c.com) and more [refs](https://d.com).

![Image](https://img.com/1.png)

## Key Citations
- [A](https://a.com)
- [B](https://b.com)
    """
        outcome = evaluator.evaluate_metrics_only(good_report)
        assert outcome["score"] > 5.0

    def test_combined_evaluation_to_dict(self):
        """CombinedEvaluation serializes itself and its nested metrics."""
        metrics = ReportMetrics(
            word_count=1000,
            citation_count=5,
            unique_sources=3,
        )
        evaluation = CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=None,
            final_score=7.5,
            grade="B+",
            summary="Test summary",
        )

        payload = evaluation.to_dict()
        assert payload["final_score"] == 7.5
        assert payload["grade"] == "B+"
        assert payload["metrics"]["word_count"] == 1000


class TestReportEvaluatorIntegration:
    """Integration tests for evaluator (may require LLM)."""

    @pytest.mark.asyncio
    async def test_full_evaluation_without_llm(self):
        """Full async evaluation succeeds with the LLM disabled."""
        evaluator = ReportEvaluator(use_llm=False)

        report = """
# Test Report

## Key Points
- Key point 1

## Overview
Test overview.

## Key Citations
- [Test](https://test.com)
    """

        outcome = await evaluator.evaluate(report, "test query")

        assert isinstance(outcome, CombinedEvaluation)
        assert outcome.final_score > 0
        assert outcome.grade is not None
        assert outcome.summary is not None
        assert outcome.llm_evaluation is None


class TestLLMJudgeParseResponse:
    """Tests for LLMJudge._parse_response."""

    @pytest.fixture
    def judge(self):
        """Create LLMJudge with mock LLM."""
        return LLMJudge(llm=MagicMock())

    @pytest.fixture
    def valid_response_data(self):
        """A fully-populated evaluation payload."""
        return {
            "scores": {
                "factual_accuracy": 8,
                "completeness": 7,
                "coherence": 9,
                "relevance": 8,
                "citation_quality": 6,
                "writing_quality": 8,
            },
            "overall_score": 8,
            "strengths": ["Well researched", "Clear structure"],
            "weaknesses": ["Could use more citations"],
            "suggestions": ["Add more sources"],
        }

    def test_parse_valid_json(self, judge, valid_response_data):
        """Bare JSON is decoded as-is."""
        parsed = judge._parse_response(json.dumps(valid_response_data))

        assert parsed["scores"]["factual_accuracy"] == 8
        assert parsed["overall_score"] == 8
        assert "Well researched" in parsed["strengths"]

    def test_parse_json_in_markdown_block(self, judge, valid_response_data):
        """JSON wrapped in a ```json fence is unwrapped."""
        raw = f"```json\n{json.dumps(valid_response_data)}\n```"
        parsed = judge._parse_response(raw)

        assert parsed["scores"]["coherence"] == 9
        assert parsed["overall_score"] == 8

    def test_parse_json_in_generic_code_block(self, judge, valid_response_data):
        """JSON wrapped in a plain ``` fence is unwrapped."""
        raw = f"```\n{json.dumps(valid_response_data)}\n```"
        parsed = judge._parse_response(raw)

        assert parsed["scores"]["relevance"] == 8

    def test_parse_malformed_json_returns_defaults(self, judge):
        """Unparseable text falls back to neutral all-5 scores."""
        parsed = judge._parse_response("This is not valid JSON at all")

        assert parsed["scores"]["factual_accuracy"] == 5
        assert parsed["scores"]["completeness"] == 5
        assert parsed["overall_score"] == 5
        assert "Unable to parse evaluation" in parsed["strengths"]
        assert "Evaluation parsing failed" in parsed["weaknesses"]

    def test_parse_incomplete_json(self, judge):
        """Truncated JSON also falls back to the defaults."""
        parsed = judge._parse_response('{"scores": {"factual_accuracy": 8}')  # missing closing braces

        assert parsed["overall_score"] == 5

    def test_parse_json_with_extra_text(self, judge, valid_response_data):
        """Surrounding chatter outside the fence is ignored."""
        raw = f"Here is my evaluation:\n```json\n{json.dumps(valid_response_data)}\n```\nHope this helps!"
        parsed = judge._parse_response(raw)

        assert parsed["scores"]["factual_accuracy"] == 8


class TestLLMJudgeCalculateWeightedScore:
    """Tests for LLMJudge._calculate_weighted_score."""

    @pytest.fixture
    def judge(self):
        """Create LLMJudge with mock LLM."""
        return LLMJudge(llm=MagicMock())

    def test_calculate_with_all_scores(self, judge):
        """Uniform 10s give a weighted score of exactly 10."""
        uniform = {name: 10 for name in (
            "factual_accuracy",
            "completeness",
            "coherence",
            "relevance",
            "citation_quality",
            "writing_quality",
        )}
        assert judge._calculate_weighted_score(uniform) == 10.0

    def test_calculate_with_varied_scores(self, judge):
        """Mixed scores combine per the published weights (total 7.25)."""
        varied = {
            "factual_accuracy": 8,   # 8 * 0.25 = 2.0
            "completeness": 6,       # 6 * 0.20 = 1.2
            "coherence": 7,          # 7 * 0.20 = 1.4
            "relevance": 9,          # 9 * 0.15 = 1.35
            "citation_quality": 5,   # 5 * 0.10 = 0.5
            "writing_quality": 8,    # 8 * 0.10 = 0.8
        }
        assert judge._calculate_weighted_score(varied) == 7.25

    def test_calculate_with_partial_scores(self, judge):
        """Missing criteria renormalize: 3.2 / 0.45 ≈ 7.11."""
        partial = {
            "factual_accuracy": 8,  # weight 0.25
            "completeness": 6,      # weight 0.20
        }
        assert judge._calculate_weighted_score(partial) == pytest.approx(7.11, abs=0.01)

    def test_calculate_with_unknown_criteria(self, judge):
        """Keys outside the rubric are ignored entirely."""
        mixed = {
            "factual_accuracy": 10,
            "unknown_criterion": 1,  # Should be ignored
        }
        assert judge._calculate_weighted_score(mixed) == 10.0

    def test_calculate_with_empty_scores(self, judge):
        """No scores at all yields 0.0."""
        assert judge._calculate_weighted_score({}) == 0.0

    def test_weights_sum_to_one(self):
        """The rubric's weights must sum to 1.0."""
        total_weight = sum(c["weight"] for c in EVALUATION_CRITERIA.values())
        assert abs(total_weight - 1.0) < 0.001
isinstance(result, EvaluationResult) + assert result.scores["factual_accuracy"] == 8 + assert result.overall_score == 8 + assert result.weighted_score > 0 + assert "Comprehensive coverage" in result.strengths + assert result.raw_response == valid_llm_response + + @pytest.mark.asyncio + async def test_evaluation_with_llm_failure(self): + """Test that LLM failures are handled gracefully.""" + mock_llm = AsyncMock() + mock_llm.ainvoke.side_effect = Exception("LLM service unavailable") + + judge = LLMJudge(llm=mock_llm) + result = await judge.evaluate("Test report", "Test query") + + assert isinstance(result, EvaluationResult) + assert result.overall_score == 0 + assert result.weighted_score == 0 + assert all(score == 0 for score in result.scores.values()) + assert any("failed" in w.lower() for w in result.weaknesses) + + @pytest.mark.asyncio + async def test_evaluation_with_malformed_response(self): + """Test handling of malformed LLM response.""" + mock_llm = AsyncMock() + mock_response = MagicMock() + mock_response.content = "I cannot evaluate this report properly." 
+ mock_llm.ainvoke.return_value = mock_response + + judge = LLMJudge(llm=mock_llm) + result = await judge.evaluate("Test report", "Test query") + + # Should return default scores + assert result.scores["factual_accuracy"] == 5 + assert result.overall_score == 5 + + @pytest.mark.asyncio + async def test_evaluation_passes_report_style(self): + """Test that report_style is passed to LLM.""" + mock_llm = AsyncMock() + mock_response = MagicMock() + mock_response.content = json.dumps( + { + "scores": {k: 7 for k in EVALUATION_CRITERIA.keys()}, + "overall_score": 7, + "strengths": [], + "weaknesses": [], + "suggestions": [], + } + ) + mock_llm.ainvoke.return_value = mock_response + + judge = LLMJudge(llm=mock_llm) + await judge.evaluate("Test report", "Test query", report_style="academic") + + # Verify the prompt contains the report style + call_args = mock_llm.ainvoke.call_args + messages = call_args[0][0] + user_message_content = messages[1].content + assert "academic" in user_message_content + + @pytest.mark.asyncio + async def test_evaluation_truncates_long_reports(self): + """Test that very long reports are truncated.""" + mock_llm = AsyncMock() + mock_response = MagicMock() + mock_response.content = json.dumps( + { + "scores": {k: 7 for k in EVALUATION_CRITERIA.keys()}, + "overall_score": 7, + "strengths": [], + "weaknesses": [], + "suggestions": [], + } + ) + mock_llm.ainvoke.return_value = mock_response + + judge = LLMJudge(llm=mock_llm) + long_report = "x" * (MAX_REPORT_LENGTH + 5000) + await judge.evaluate(long_report, "Test query") + + call_args = mock_llm.ainvoke.call_args + messages = call_args[0][0] + user_message_content = messages[1].content + # The report content in the message should be truncated to MAX_REPORT_LENGTH + assert len(user_message_content) < len(long_report) + 500 + + +class TestEvaluationResult: + """Tests for EvaluationResult dataclass.""" + + def test_to_dict(self): + """Test EvaluationResult.to_dict method.""" + result = EvaluationResult( 
+ scores={"factual_accuracy": 8, "completeness": 7}, + overall_score=7.5, + weighted_score=7.6, + strengths=["Good research"], + weaknesses=["Needs more detail"], + suggestions=["Expand section 2"], + raw_response="test response", + ) + + d = result.to_dict() + assert d["scores"]["factual_accuracy"] == 8 + assert d["overall_score"] == 7.5 + assert d["weighted_score"] == 7.6 + assert "Good research" in d["strengths"] + # raw_response should not be in dict + assert "raw_response" not in d diff --git a/tests/unit/eval/test_metrics.py b/tests/unit/eval/test_metrics.py new file mode 100644 index 0000000..fbc38e1 --- /dev/null +++ b/tests/unit/eval/test_metrics.py @@ -0,0 +1,207 @@ +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# SPDX-License-Identifier: MIT + +"""Unit tests for report evaluation metrics.""" + +from src.eval.metrics import ( + compute_metrics, + count_citations, + count_images, + count_words, + detect_sections, + extract_domains, + get_word_count_target, +) + + +class TestCountWords: + """Tests for word counting function.""" + + def test_english_words(self): + text = "This is a simple test sentence." + assert count_words(text) == 6 + + def test_chinese_characters(self): + text = "这是一个测试" + assert count_words(text) == 6 + + def test_mixed_content(self): + text = "Hello 你好 World 世界" + assert count_words(text) == 4 + 2 # 2 English + 4 Chinese + + def test_empty_string(self): + assert count_words("") == 0 + + +class TestCountCitations: + """Tests for citation counting function.""" + + def test_markdown_citations(self): + text = """ + Check out [Google](https://google.com) and [GitHub](https://github.com). + """ + assert count_citations(text) == 2 + + def test_no_citations(self): + text = "This is plain text without any links." 
+ assert count_citations(text) == 0 + + def test_invalid_urls(self): + text = "[Link](not-a-url) [Another](ftp://ftp.example.com)" + assert count_citations(text) == 0 + + def test_complex_urls(self): + text = "[Article](https://example.com/path/to/article?id=123&ref=test)" + assert count_citations(text) == 1 + + +class TestExtractDomains: + """Tests for domain extraction function.""" + + def test_extract_multiple_domains(self): + text = """ + https://google.com/search + https://www.github.com/user/repo + https://docs.python.org/3/ + """ + domains = extract_domains(text) + assert len(domains) == 3 + assert "google.com" in domains + assert "github.com" in domains + assert "docs.python.org" in domains + + def test_deduplicate_domains(self): + text = """ + https://example.com/page1 + https://example.com/page2 + https://www.example.com/page3 + """ + domains = extract_domains(text) + assert len(domains) == 1 + assert "example.com" in domains + + def test_no_urls(self): + text = "Plain text without URLs" + assert extract_domains(text) == [] + + +class TestCountImages: + """Tests for image counting function.""" + + def test_markdown_images(self): + text = """ + ![Alt text](https://example.com/image1.png) + ![](https://example.com/image2.jpg) + """ + assert count_images(text) == 2 + + def test_no_images(self): + text = "Text without images [link](url)" + assert count_images(text) == 0 + + +class TestDetectSections: + """Tests for section detection function.""" + + def test_detect_title(self): + text = "# My Report Title\n\nSome content here." 
+ sections = detect_sections(text) + assert sections.get("title") is True + + def test_detect_key_points(self): + text = "## Key Points\n- Point 1\n- Point 2" + sections = detect_sections(text) + assert sections.get("key_points") is True + + def test_detect_chinese_sections(self): + text = """# 报告标题 +## 要点 +- 要点1 +## 概述 +这是概述内容 + """ + sections = detect_sections(text) + assert sections.get("title") is True + assert sections.get("key_points") is True + assert sections.get("overview") is True + + def test_detect_citations_section(self): + text = """ + ## Key Citations + - [Source 1](https://example.com) + """ + sections = detect_sections(text) + assert sections.get("key_citations") is True + + +class TestComputeMetrics: + """Tests for the main compute_metrics function.""" + + def test_complete_report(self): + report = """ +# Research Report Title + +## Key Points +- Point 1 +- Point 2 +- Point 3 + +## Overview +This is an overview of the research topic. + +## Detailed Analysis +Here is the detailed analysis with [source](https://example.com). + +![Figure 1](https://example.com/image.png) + +## Key Citations +- [Source 1](https://example.com) +- [Source 2](https://another.com) + """ + metrics = compute_metrics(report) + + assert metrics.has_title is True + assert metrics.has_key_points is True + assert metrics.has_overview is True + assert metrics.has_citations_section is True + assert metrics.citation_count >= 2 + assert metrics.image_count == 1 + assert metrics.unique_sources >= 1 + assert metrics.section_coverage_score > 0.5 + + def test_minimal_report(self): + report = "Just some text without structure." 
+ metrics = compute_metrics(report) + + assert metrics.has_title is False + assert metrics.citation_count == 0 + assert metrics.section_coverage_score < 0.5 + + def test_metrics_to_dict(self): + report = "# Title\n\nSome content" + metrics = compute_metrics(report) + result = metrics.to_dict() + + assert isinstance(result, dict) + assert "word_count" in result + assert "citation_count" in result + assert "section_coverage_score" in result + + +class TestGetWordCountTarget: + """Tests for word count target function.""" + + def test_strategic_investment_target(self): + target = get_word_count_target("strategic_investment") + assert target["min"] == 10000 + assert target["max"] == 15000 + + def test_news_target(self): + target = get_word_count_target("news") + assert target["min"] == 800 + assert target["max"] == 2000 + + def test_default_target(self): + target = get_word_count_target("unknown_style") + assert target["min"] == 1000 + assert target["max"] == 5000 diff --git a/web/messages/en.json b/web/messages/en.json index 7cb1619..826fc1a 100644 --- a/web/messages/en.json +++ b/web/messages/en.json @@ -150,6 +150,7 @@ "downloadWord": "Word (.docx)", "downloadImage": "Image (.png)", "exportFailed": "Export failed, please try again", + "evaluateReport": "Evaluate report quality", "searchingFor": "Searching for", "reading": "Reading", "runningPythonCode": "Running Python code", @@ -163,6 +164,31 @@ "errorGeneratingPodcast": "Error when generating podcast. 
Please try again.", "downloadPodcast": "Download podcast" }, + "evaluation": { + "title": "Report Quality Evaluation", + "description": "Evaluate your report using automated metrics and AI analysis.", + "evaluating": "Evaluating report...", + "analyzing": "Running deep analysis...", + "overallScore": "Overall Score", + "metrics": "Report Metrics", + "wordCount": "Word Count", + "citations": "Citations", + "sources": "Unique Sources", + "images": "Images", + "sectionCoverage": "Section Coverage", + "detailedAnalysis": "Detailed Analysis", + "deepEvaluation": "Deep Evaluation (AI)", + "strengths": "Strengths", + "weaknesses": "Areas for Improvement", + "scores": { + "factual_accuracy": "Factual Accuracy", + "completeness": "Completeness", + "coherence": "Coherence", + "relevance": "Relevance", + "citation_quality": "Citation Quality", + "writing_quality": "Writing Quality" + } + }, "messages": { "replaying": "Replaying", "replayDescription": "DeerFlow is now replaying the conversation...", diff --git a/web/messages/zh.json b/web/messages/zh.json index d51dff3..e84d9f9 100644 --- a/web/messages/zh.json +++ b/web/messages/zh.json @@ -150,6 +150,7 @@ "downloadWord": "Word (.docx)", "downloadImage": "图片 (.png)", "exportFailed": "导出失败,请重试", + "evaluateReport": "评估报告质量", "searchingFor": "搜索", "reading": "阅读中", "runningPythonCode": "运行 Python 代码", @@ -163,6 +164,31 @@ "errorGeneratingPodcast": "生成播客时出错。请重试。", "downloadPodcast": "下载播客" }, + "evaluation": { + "title": "报告质量评估", + "description": "使用自动化指标和 AI 分析评估您的报告。", + "evaluating": "正在评估报告...", + "analyzing": "正在进行深度分析...", + "overallScore": "总体评分", + "metrics": "报告指标", + "wordCount": "字数", + "citations": "引用数", + "sources": "独立来源", + "images": "图片数", + "sectionCoverage": "章节覆盖率", + "detailedAnalysis": "详细分析", + "deepEvaluation": "深度评估 (AI)", + "strengths": "优势", + "weaknesses": "改进建议", + "scores": { + "factual_accuracy": "事实准确性", + "completeness": "完整性", + "coherence": "连贯性", + "relevance": "相关性", + "citation_quality": 
"引用质量", + "writing_quality": "写作质量" + } + }, "messages": { "replaying": "回放中", "replayDescription": "DeerFlow 正在回放对话...", diff --git a/web/src/app/chat/components/evaluation-dialog.tsx b/web/src/app/chat/components/evaluation-dialog.tsx new file mode 100644 index 0000000..6c3524c --- /dev/null +++ b/web/src/app/chat/components/evaluation-dialog.tsx @@ -0,0 +1,300 @@ +// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +// SPDX-License-Identifier: MIT + +import { + BookOpen, + FileText, + Image, + Link2, + Loader2, + Sparkles, + ThumbsDown, + ThumbsUp, +} from "lucide-react"; +import { useTranslations } from "next-intl"; +import { useCallback, useEffect, useRef, useState } from "react"; + +import { Button } from "~/components/ui/button"; +import { + Dialog, + DialogContent, + DialogDescription, + DialogHeader, + DialogTitle, +} from "~/components/ui/dialog"; +import { Progress } from "~/components/ui/progress"; +import { evaluateReport, type EvaluationResult } from "~/core/api"; +import { cn } from "~/lib/utils"; + +interface EvaluationDialogProps { + open: boolean; + onOpenChange: (open: boolean) => void; + reportContent: string; + query: string; + reportStyle?: string; +} + +function GradeBadge({ grade }: { grade: string }) { + const gradeColors: Record = { + "A+": "bg-emerald-500", + A: "bg-emerald-500", + "A-": "bg-emerald-400", + "B+": "bg-blue-500", + B: "bg-blue-500", + "B-": "bg-blue-400", + "C+": "bg-yellow-500", + C: "bg-yellow-500", + "C-": "bg-yellow-400", + D: "bg-orange-500", + F: "bg-red-500", + }; + + return ( +
+ {grade} +
+ ); +} + +function MetricItem({ + icon: Icon, + label, + value, + suffix, +}: { + icon: React.ComponentType<{ className?: string }>; + label: string; + value: number | string; + suffix?: string; +}) { + return ( +
+ + {label} + + {value} + {suffix} + +
+ ); +} + +export function EvaluationDialog({ + open, + onOpenChange, + reportContent, + query, + reportStyle, +}: EvaluationDialogProps) { + const t = useTranslations("chat.evaluation"); + const [loading, setLoading] = useState(false); + const [deepLoading, setDeepLoading] = useState(false); + const [result, setResult] = useState(null); + const [error, setError] = useState(null); + const hasRunInitialEvaluation = useRef(false); + + const runEvaluation = useCallback( + async (useLlm: boolean) => { + if (useLlm) { + setDeepLoading(true); + } else { + setLoading(true); + } + setError(null); + + try { + const evalResult = await evaluateReport( + reportContent, + query, + reportStyle, + useLlm, + ); + setResult(evalResult); + } catch (err) { + setError(err instanceof Error ? err.message : "Evaluation failed"); + } finally { + setLoading(false); + setDeepLoading(false); + } + }, + [reportContent, query, reportStyle], + ); + + useEffect(() => { + if (open && !hasRunInitialEvaluation.current) { + hasRunInitialEvaluation.current = true; + void runEvaluation(false); + } + }, [open, runEvaluation]); + + useEffect(() => { + if (!open) { + setResult(null); + setError(null); + hasRunInitialEvaluation.current = false; + } + }, [open]); + + return ( + + + + {t("title")} + {t("description")} + + + {loading && !result ? ( +
+ +

+ {t("evaluating")} +

+
+ ) : error ? ( +
{error}
+ ) : result ? ( +
+ {/* Grade and Score */} +
+ +
+
{result.score}/10
+
+ {t("overallScore")} +
+
+
+ + {/* Metrics */} +
+

{t("metrics")}

+
+ + + + +
+
+ + {t("sectionCoverage")} + + + {Math.round(result.metrics.section_coverage_score * 100)}% + +
+ +
+
+
+ + {/* LLM Evaluation Results */} + {result.llm_evaluation && ( +
+

{t("detailedAnalysis")}

+ + {/* LLM Scores */} +
+ {Object.entries(result.llm_evaluation.scores).map( + ([key, value]) => ( +
+ + {t(`scores.${key}`)} + + {value}/10 +
+ ), + )} +
+ + {/* Strengths */} + {result.llm_evaluation.strengths.length > 0 && ( +
+
+ + {t("strengths")} +
+
    + {result.llm_evaluation.strengths + .slice(0, 3) + .map((s, i) => ( +
  • + • {s} +
  • + ))} +
+
+ )} + + {/* Weaknesses */} + {result.llm_evaluation.weaknesses.length > 0 && ( +
+
+ + {t("weaknesses")} +
+
    + {result.llm_evaluation.weaknesses + .slice(0, 3) + .map((w, i) => ( +
  • + • {w} +
  • + ))} +
+
+ )} +
+ )} + + {/* Deep Evaluation Button */} + {!result.llm_evaluation && ( + + )} +
+ ) : null} +
+
+ ); +} diff --git a/web/src/app/chat/components/research-block.tsx b/web/src/app/chat/components/research-block.tsx index 2880351..f4d1e3f 100644 --- a/web/src/app/chat/components/research-block.tsx +++ b/web/src/app/chat/components/research-block.tsx @@ -16,6 +16,7 @@ import { jsPDF } from "jspdf"; import { Check, Copy, + GraduationCap, Headphones, Pencil, Undo2, @@ -43,9 +44,10 @@ import { } from "~/components/ui/dropdown-menu"; import { Tabs, TabsContent, TabsList, TabsTrigger } from "~/components/ui/tabs"; import { useReplay } from "~/core/replay"; -import { closeResearch, listenToPodcast, useStore } from "~/core/store"; +import { closeResearch, getResearchQuery, listenToPodcast, useStore, useSettingsStore } from "~/core/store"; import { cn } from "~/lib/utils"; +import { EvaluationDialog } from "./evaluation-dialog"; import { ResearchActivitiesBlock } from "./research-activities-block"; import { ResearchReportBlock } from "./research-report-block"; @@ -84,6 +86,7 @@ export function ResearchBlock({ const [editing, setEditing] = useState(false); const [isDownloading, setIsDownloading] = useState(false); const [copied, setCopied] = useState(false); + const [showEvaluation, setShowEvaluation] = useState(false); const handleCopy = useCallback(() => { if (!reportId) { return; @@ -676,6 +679,16 @@ ${htmlContent} {copied ? : } + + + @@ -796,6 +809,19 @@ ${htmlContent} + + {/* Evaluation Dialog */} + {reportId && researchId && ( + + )} ); } diff --git a/web/src/components/ui/progress.tsx b/web/src/components/ui/progress.tsx new file mode 100644 index 0000000..248e284 --- /dev/null +++ b/web/src/components/ui/progress.tsx @@ -0,0 +1,30 @@ +"use client" + +import * as React from "react" + +import { cn } from "~/lib/utils" + +interface ProgressProps extends React.HTMLAttributes { + value?: number +} + +function Progress({ className, value = 0, ...props }: ProgressProps) { + return ( +
+
+
+ ) +} + +export { Progress } diff --git a/web/src/core/api/evaluate.ts b/web/src/core/api/evaluate.ts new file mode 100644 index 0000000..9d46f8a --- /dev/null +++ b/web/src/core/api/evaluate.ts @@ -0,0 +1,91 @@ +// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +// SPDX-License-Identifier: MIT + +import { resolveServiceURL } from "./resolve-service-url"; + +/** + * Report evaluation API client. + */ + +export interface EvaluationMetrics { + word_count: number; + citation_count: number; + unique_sources: number; + image_count: number; + section_count: number; + section_coverage_score: number; + sections_found: string[]; + sections_missing: string[]; + has_title: boolean; + has_key_points: boolean; + has_overview: boolean; + has_citations_section: boolean; +} + +export interface LLMEvaluationScores { + factual_accuracy: number; + completeness: number; + coherence: number; + relevance: number; + citation_quality: number; + writing_quality: number; +} + +export interface LLMEvaluation { + scores: LLMEvaluationScores; + overall_score: number; + weighted_score: number; + strengths: string[]; + weaknesses: string[]; + suggestions: string[]; +} + +export interface EvaluationResult { + metrics: EvaluationMetrics; + score: number; + grade: string; + llm_evaluation?: LLMEvaluation; + summary?: string; +} + +export interface EvaluateReportRequest { + content: string; + query: string; + report_style?: string; + use_llm?: boolean; +} + +/** + * Evaluate a report's quality using automated metrics and optionally LLM-as-Judge. + * + * @param content - Report markdown content + * @param query - Original research query + * @param reportStyle - Report style (academic, news, etc.) 
+ * @param useLlm - Whether to use LLM for deep evaluation + * @returns Evaluation result with metrics, score, and grade + */ +export async function evaluateReport( + content: string, + query: string, + reportStyle?: string, + useLlm?: boolean, +): Promise { + const response = await fetch(resolveServiceURL("report/evaluate"), { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + content, + query, + report_style: reportStyle ?? "default", + use_llm: useLlm ?? false, + } satisfies EvaluateReportRequest), + }); + + if (!response.ok) { + throw new Error(`Evaluation failed: ${response.statusText}`); + } + + return response.json(); +} diff --git a/web/src/core/api/index.ts b/web/src/core/api/index.ts index e21a050..2ad2c65 100644 --- a/web/src/core/api/index.ts +++ b/web/src/core/api/index.ts @@ -2,6 +2,7 @@ // SPDX-License-Identifier: MIT export * from "./chat"; +export * from "./evaluate"; export * from "./mcp"; export * from "./podcast"; export * from "./prompt-enhancer"; diff --git a/web/src/core/store/store.ts b/web/src/core/store/store.ts index 79167fe..357b62d 100644 --- a/web/src/core/store/store.ts +++ b/web/src/core/store/store.ts @@ -24,6 +24,7 @@ export const useStore = create<{ researchPlanIds: Map; researchReportIds: Map; researchActivityIds: Map; + researchQueries: Map; ongoingResearchId: string | null; openResearchId: string | null; @@ -42,6 +43,7 @@ export const useStore = create<{ researchPlanIds: new Map(), researchReportIds: new Map(), researchActivityIds: new Map(), + researchQueries: new Map(), ongoingResearchId: null, openResearchId: null, @@ -267,11 +269,17 @@ function getOngoingResearchId() { function appendResearch(researchId: string) { let planMessage: Message | undefined; + let userQuery: string | undefined; const reversedMessageIds = [...useStore.getState().messageIds].reverse(); for (const messageId of reversedMessageIds) { const message = getMessage(messageId); - if (message?.agent === 
"planner") { + if (!planMessage && message?.agent === "planner") { planMessage = message; + } + if (!userQuery && message?.role === "user") { + userQuery = message.content; + } + if (planMessage && userQuery) { break; } } @@ -288,6 +296,10 @@ function appendResearch(researchId: string) { researchId, messageIds, ), + researchQueries: new Map(useStore.getState().researchQueries).set( + researchId, + userQuery ?? "", + ), }); } @@ -394,6 +406,10 @@ export function useResearchMessage(researchId: string) { ); } +export function getResearchQuery(researchId: string): string { + return useStore.getState().researchQueries.get(researchId) ?? ""; +} + export function useMessage(messageId: string | null | undefined) { return useStore( useShallow((state) =>