feat(eval): add report quality evaluation module and UI integration (#776)

* feat(eval): add report quality evaluation module

Addresses issue #773 - How to evaluate generated report quality objectively.

This module provides two evaluation approaches:
1. Automated metrics (no LLM required):
   - Citation count and source diversity
   - Word count compliance per report style
   - Section structure validation
   - Image inclusion tracking

2. LLM-as-Judge evaluation:
   - Factual accuracy scoring
   - Completeness assessment
   - Coherence evaluation
   - Relevance and citation quality checks

The combined evaluator provides a final score (1-10) and letter grade (A+ to F).

Files added:
- src/eval/__init__.py
- src/eval/metrics.py
- src/eval/llm_judge.py
- src/eval/evaluator.py
- tests/unit/eval/test_metrics.py
- tests/unit/eval/test_evaluator.py

* feat(eval): integrate report evaluation with web UI

This commit adds the web UI integration for the evaluation module:

Backend:
- Add EvaluateReportRequest/Response models in src/server/eval_request.py
- Add /api/report/evaluate endpoint to src/server/app.py

Frontend:
- Add evaluateReport API function in web/src/core/api/evaluate.ts
- Create EvaluationDialog component with grade badge, metrics display,
  and optional LLM deep evaluation
- Add evaluation button (graduation cap icon) to research-block.tsx toolbar
- Add i18n translations for English and Chinese

The evaluation UI allows users to:
1. View quick metrics-only evaluation (instant)
2. Optionally run deep LLM-based evaluation for detailed analysis
3. See grade (A+ to F), score (1-10), and metric breakdown

* feat(eval): improve evaluation reliability and add LLM judge tests

- Extract MAX_REPORT_LENGTH constant in llm_judge.py for maintainability
- Add comprehensive unit tests for LLMJudge class (parse_response,
  calculate_weighted_score, evaluate with mocked LLM)
- Pass reportStyle prop to EvaluationDialog for accurate evaluation criteria
- Add researchQueries store map to reliably associate queries with research
- Add getResearchQuery helper to retrieve query by researchId
- Remove unused imports in test_metrics.py

* fix(eval): use resolveServiceURL for evaluate API endpoint

The evaluateReport function was using a relative URL '/api/report/evaluate'
which sent requests to the Next.js server instead of the FastAPI backend.
Changed to use resolveServiceURL() consistent with other API functions.

* fix: improve type accuracy and React hooks in evaluation components

- Fix get_word_count_target return type from Optional[Dict] to Dict since it always returns a value via default fallback
- Fix useEffect dependency issue in EvaluationDialog using useRef to prevent unwanted re-evaluations
- Add aria-label to GradeBadge for screen reader accessibility
This commit is contained in:
Willem Jiang
2025-12-25 21:55:48 +08:00
committed by GitHub
parent 84a7f7815c
commit 8d9d767051
17 changed files with 2103 additions and 2 deletions

21
src/eval/__init__.py Normal file
View File

@@ -0,0 +1,21 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""
Report Quality Evaluation Module for DeerFlow.
This module provides objective methods to evaluate generated report quality,
including automated metrics and LLM-based evaluation.
"""
from .evaluator import ReportEvaluator
from .metrics import ReportMetrics, compute_metrics
from .llm_judge import LLMJudge, evaluate_with_llm

# Public API of the eval package.
__all__ = [
    "ReportEvaluator",
    "ReportMetrics",
    "compute_metrics",
    "LLMJudge",
    "evaluate_with_llm",
]

249
src/eval/evaluator.py Normal file
View File

@@ -0,0 +1,249 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""
Combined report evaluator orchestrating both automated metrics and LLM evaluation.
"""
import logging
from dataclasses import dataclass
from typing import Any, Dict, Optional
from .llm_judge import EvaluationResult, LLMJudge
from .metrics import ReportMetrics, compute_metrics, get_word_count_target
logger = logging.getLogger(__name__)
@dataclass
class CombinedEvaluation:
    """Aggregated evaluation output: automated metrics plus the optional
    LLM-as-Judge assessment, with the blended score and letter grade."""

    metrics: ReportMetrics  # automated metric values
    llm_evaluation: Optional[EvaluationResult]  # None when LLM judging was skipped or failed
    final_score: float  # combined 0-10 score
    grade: str  # letter grade derived from final_score
    summary: str  # human-readable markdown summary

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the evaluation into a plain dictionary."""
        llm_part = None
        if self.llm_evaluation is not None:
            llm_part = self.llm_evaluation.to_dict()
        return {
            "metrics": self.metrics.to_dict(),
            "llm_evaluation": llm_part,
            "final_score": self.final_score,
            "grade": self.grade,
            "summary": self.summary,
        }
def score_to_grade(score: float) -> str:
    """Map a numeric score (roughly 0-10) onto a letter grade.

    Thresholds are inclusive lower bounds, checked from highest to lowest;
    anything below 4.0 is an F.
    """
    grade_cutoffs = (
        (9.0, "A+"),
        (8.5, "A"),
        (8.0, "A-"),
        (7.5, "B+"),
        (7.0, "B"),
        (6.5, "B-"),
        (6.0, "C+"),
        (5.5, "C"),
        (5.0, "C-"),
        (4.0, "D"),
    )
    for cutoff, grade in grade_cutoffs:
        if score >= cutoff:
            return grade
    return "F"
class ReportEvaluator:
    """
    Combined report evaluator using both automated metrics and LLM-as-Judge.
    This evaluator provides comprehensive report quality assessment by:
    1. Computing automated metrics (fast, deterministic)
    2. Running LLM-based evaluation (nuanced, contextual)
    3. Combining both for a final score and grade
    """

    def __init__(self, llm: Any = None, use_llm: bool = True):
        """
        Initialize the evaluator.
        Args:
            llm: Optional LLM instance for LLM-as-Judge evaluation
            use_llm: Whether to use LLM evaluation (can be disabled for speed)
        """
        self.use_llm = use_llm
        # Only construct the judge when LLM evaluation is requested.
        self.llm_judge = LLMJudge(llm=llm) if use_llm else None

    def _compute_metrics_score(
        self, metrics: ReportMetrics, report_style: str
    ) -> float:
        """
        Convert automated metrics to a 0-10 score.
        Scoring breakdown:
        - Section coverage: 30%
        - Citation quality: 25%
        - Word count compliance: 20%
        - Source diversity: 15%
        - Image inclusion: 10%
        """
        score = 0.0
        # Section coverage: fraction of required sections present, scaled to 0-10.
        section_score = metrics.section_coverage_score * 10
        score += section_score * 0.30
        # Citations: 10 or more citations earns full marks.
        citation_score = min(metrics.citation_count / 10, 1.0) * 10
        score += citation_score * 0.25
        # Word count compliance relative to the style's target range.
        target = get_word_count_target(report_style)
        # NOTE: always truthy — get_word_count_target falls back to a default range.
        if target:
            if target["min"] <= metrics.word_count <= target["max"]:
                word_score = 10.0
            elif metrics.word_count < target["min"]:
                # Under target: scale proportionally, capped at 8.
                word_score = (metrics.word_count / target["min"]) * 8
            else:
                # Over target: deduct 5 points per 100% excess, floored at 5.
                excess_ratio = metrics.word_count / target["max"]
                word_score = max(10 - (excess_ratio - 1) * 5, 5)
            score += word_score * 0.20
        # Source diversity: 5 or more unique domains earns full marks.
        diversity_score = min(metrics.unique_sources / 5, 1.0) * 10
        score += diversity_score * 0.15
        # Images: 3 or more images earns full marks.
        image_score = min(metrics.image_count / 3, 1.0) * 10
        score += image_score * 0.10
        return round(score, 2)

    def _generate_summary(
        self,
        metrics: ReportMetrics,
        llm_eval: Optional[EvaluationResult],
        final_score: float,
        grade: str,
    ) -> str:
        """Generate a human-readable evaluation summary.

        Produces a markdown document with the grade, the automated metric
        values, and (when available) the LLM scores plus up to three
        strengths and weaknesses.
        """
        lines = [f"Report Grade: {grade} ({final_score}/10)", ""]
        lines.append("**Automated Metrics:**")
        lines.append(f"- Word Count: {metrics.word_count}")
        lines.append(f"- Citations: {metrics.citation_count}")
        lines.append(f"- Unique Sources: {metrics.unique_sources}")
        lines.append(f"- Images: {metrics.image_count}")
        lines.append(
            f"- Section Coverage: {metrics.section_coverage_score * 100:.0f}%"
        )
        if metrics.sections_missing:
            lines.append(f"- Missing Sections: {', '.join(metrics.sections_missing)}")
        if llm_eval:
            lines.append("")
            lines.append("**LLM Evaluation:**")
            for criterion, score in llm_eval.scores.items():
                lines.append(f"- {criterion.replace('_', ' ').title()}: {score}/10")
            if llm_eval.strengths:
                lines.append("")
                lines.append("**Strengths:**")
                # Cap at three entries to keep the summary compact.
                for strength in llm_eval.strengths[:3]:
                    lines.append(f"- {strength}")
            if llm_eval.weaknesses:
                lines.append("")
                lines.append("**Areas for Improvement:**")
                for weakness in llm_eval.weaknesses[:3]:
                    lines.append(f"- {weakness}")
        return "\n".join(lines)

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """
        Evaluate a report using both metrics and LLM.
        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report
        Returns:
            CombinedEvaluation with full results
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)
        llm_eval = None
        if self.use_llm and self.llm_judge:
            try:
                llm_eval = await self.llm_judge.evaluate(report, query, report_style)
            except Exception as e:
                # LLM judging is best-effort; degrade to metrics-only scoring.
                logger.warning(f"LLM evaluation failed, using metrics only: {e}")
        if llm_eval and llm_eval.overall_score > 0:
            # Blend: 40% automated metrics, 60% LLM weighted score.
            final_score = (metrics_score * 0.4) + (llm_eval.weighted_score * 0.6)
        else:
            final_score = metrics_score
        final_score = round(final_score, 2)
        grade = score_to_grade(final_score)
        summary = self._generate_summary(metrics, llm_eval, final_score, grade)
        return CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=llm_eval,
            final_score=final_score,
            grade=grade,
            summary=summary,
        )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """Synchronous version of evaluate.

        NOTE(review): uses asyncio.run, so this cannot be called from inside
        an already-running event loop.
        """
        import asyncio
        return asyncio.run(self.evaluate(report, query, report_style))

    def evaluate_metrics_only(
        self,
        report: str,
        report_style: str = "default",
    ) -> Dict[str, Any]:
        """
        Quick evaluation using only automated metrics (no LLM).
        Args:
            report: The report text to evaluate
            report_style: The style of report
        Returns:
            Dictionary with metrics and score
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)
        grade = score_to_grade(metrics_score)
        return {
            "metrics": metrics.to_dict(),
            "score": metrics_score,
            "grade": grade,
        }

282
src/eval/llm_judge.py Normal file
View File

@@ -0,0 +1,282 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""
LLM-as-Judge evaluation for report quality.
Uses an LLM to evaluate reports on multiple quality dimensions,
providing more nuanced assessment than automated metrics alone.
"""
import json
import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from langchain_core.messages import HumanMessage, SystemMessage
logger = logging.getLogger(__name__)
# Maximum characters of report content to send to the LLM for evaluation.
# This limit prevents exceeding LLM context windows and controls token usage.
MAX_REPORT_LENGTH = 15000
EVALUATION_CRITERIA = {
"factual_accuracy": {
"description": "Are claims supported by cited sources? Is information accurate and verifiable?",
"weight": 0.25,
},
"completeness": {
"description": "Does the report comprehensively cover all aspects of the topic?",
"weight": 0.20,
},
"coherence": {
"description": "Is the report logically structured, well-organized, and easy to follow?",
"weight": 0.20,
},
"relevance": {
"description": "Does the content directly address the research question without unnecessary tangents?",
"weight": 0.15,
},
"citation_quality": {
"description": "Are sources credible, diverse, and properly cited?",
"weight": 0.10,
},
"writing_quality": {
"description": "Is the writing clear, professional, and appropriate for the target audience?",
"weight": 0.10,
},
}
JUDGE_SYSTEM_PROMPT = """You are an expert report quality evaluator. Your task is to objectively assess the quality of research reports.
Evaluate the report on the following criteria, scoring each from 1-10:
1. **Factual Accuracy** (1-10): Are claims supported by cited sources? Is information accurate?
2. **Completeness** (1-10): Does the report cover all aspects of the topic comprehensively?
3. **Coherence** (1-10): Is the report logically structured and easy to follow?
4. **Relevance** (1-10): Does content directly address the research question?
5. **Citation Quality** (1-10): Are sources credible, diverse, and properly cited?
6. **Writing Quality** (1-10): Is the writing clear and appropriate for the audience?
Respond ONLY with a valid JSON object in this exact format:
{
"scores": {
"factual_accuracy": <1-10>,
"completeness": <1-10>,
"coherence": <1-10>,
"relevance": <1-10>,
"citation_quality": <1-10>,
"writing_quality": <1-10>
},
"overall_score": <1-10>,
"strengths": ["strength1", "strength2"],
"weaknesses": ["weakness1", "weakness2"],
"suggestions": ["suggestion1", "suggestion2"]
}
Be objective and thorough in your evaluation."""
@dataclass
class EvaluationResult:
    """Structured result of one LLM-as-Judge evaluation run."""

    scores: Dict[str, int]  # per-criterion scores (1-10; 0 on failure)
    overall_score: float  # judge's own holistic rating
    weighted_score: float  # criteria-weight-adjusted average
    strengths: List[str]
    weaknesses: List[str]
    suggestions: List[str]
    raw_response: Optional[str] = None  # raw LLM output, kept for debugging

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the result, deliberately omitting raw_response."""
        public_fields = (
            "scores",
            "overall_score",
            "weighted_score",
            "strengths",
            "weaknesses",
            "suggestions",
        )
        return {name: getattr(self, name) for name in public_fields}
class LLMJudge:
    """LLM-based report quality evaluator."""

    def __init__(self, llm: Any = None):
        """
        Initialize the LLM Judge.
        Args:
            llm: LangChain-compatible LLM instance. If None, will be created on demand.
        """
        self._llm = llm

    def _get_llm(self):
        """Get or create the LLM instance (import is local to avoid cycles)."""
        if self._llm is None:
            from src.llms.llm import get_llm_by_type
            self._llm = get_llm_by_type("basic")
        return self._llm

    def _calculate_weighted_score(self, scores: Dict[str, int]) -> float:
        """Calculate weighted average score based on criteria weights.

        Criteria not listed in EVALUATION_CRITERIA are ignored; returns 0.0
        when no known criteria are present (avoids division by zero).
        """
        total_weight = 0
        weighted_sum = 0
        for criterion, score in scores.items():
            if criterion in EVALUATION_CRITERIA:
                weight = EVALUATION_CRITERIA[criterion]["weight"]
                weighted_sum += score * weight
                total_weight += weight
        if total_weight > 0:
            return round(weighted_sum / total_weight, 2)
        return 0.0

    def _parse_response(self, response: str) -> Dict[str, Any]:
        """Parse LLM response into structured format.

        Accepts raw JSON as well as JSON wrapped in a ```json or generic
        ``` code fence. On parse failure, returns neutral mid-scale (5/10)
        defaults so the caller can still produce a result.
        """
        try:
            json_match = response
            if "```json" in response:
                json_match = response.split("```json")[1].split("```")[0]
            elif "```" in response:
                json_match = response.split("```")[1].split("```")[0]
            return json.loads(json_match.strip())
        except (json.JSONDecodeError, IndexError) as e:
            logger.warning(f"Failed to parse LLM response: {e}")
            # Neutral fallback: distinct from the all-zero result used when
            # the LLM call itself fails (see evaluate()).
            return {
                "scores": {
                    "factual_accuracy": 5,
                    "completeness": 5,
                    "coherence": 5,
                    "relevance": 5,
                    "citation_quality": 5,
                    "writing_quality": 5,
                },
                "overall_score": 5,
                "strengths": ["Unable to parse evaluation"],
                "weaknesses": ["Evaluation parsing failed"],
                "suggestions": ["Please re-run evaluation"],
            }

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> EvaluationResult:
        """
        Evaluate a report using LLM-as-Judge.
        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report for context
        Returns:
            EvaluationResult with scores and feedback
        """
        llm = self._get_llm()
        # Report is truncated to MAX_REPORT_LENGTH to stay within context limits.
        user_prompt = f"""Please evaluate the following research report.
**Original Research Query:** {query}
**Report Style:** {report_style}
**Report to Evaluate:**
{report[:MAX_REPORT_LENGTH]}
Provide your evaluation in the specified JSON format."""
        messages = [
            SystemMessage(content=JUDGE_SYSTEM_PROMPT),
            HumanMessage(content=user_prompt),
        ]
        try:
            response = await llm.ainvoke(messages)
            # Some LLM wrappers return plain strings instead of message objects.
            response_text = (
                response.content if hasattr(response, "content") else str(response)
            )
            parsed = self._parse_response(response_text)
            scores = parsed.get("scores", {})
            weighted_score = self._calculate_weighted_score(scores)
            return EvaluationResult(
                scores=scores,
                overall_score=parsed.get("overall_score", 5),
                weighted_score=weighted_score,
                strengths=parsed.get("strengths", []),
                weaknesses=parsed.get("weaknesses", []),
                suggestions=parsed.get("suggestions", []),
                raw_response=response_text,
            )
        except Exception as e:
            logger.error(f"LLM evaluation failed: {e}")
            # All-zero scores signal total failure to the caller (the
            # combined evaluator treats overall_score == 0 as "no LLM data").
            return EvaluationResult(
                scores={
                    "factual_accuracy": 0,
                    "completeness": 0,
                    "coherence": 0,
                    "relevance": 0,
                    "citation_quality": 0,
                    "writing_quality": 0,
                },
                overall_score=0,
                weighted_score=0,
                strengths=[],
                weaknesses=[f"Evaluation failed: {str(e)}"],
                suggestions=["Please retry evaluation"],
            )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> EvaluationResult:
        """
        Synchronous version of evaluate.

        NOTE(review): uses asyncio.run, so this cannot be called from inside
        an already-running event loop.
        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report for context
        Returns:
            EvaluationResult with scores and feedback
        """
        import asyncio
        return asyncio.run(self.evaluate(report, query, report_style))
async def evaluate_with_llm(
    report: str,
    query: str,
    report_style: str = "default",
    llm: Any = None,
) -> EvaluationResult:
    """
    Convenience wrapper: build a one-off LLMJudge and run a single evaluation.

    Args:
        report: The report text to evaluate
        query: The original research query
        report_style: The style of report for context
        llm: Optional LLM instance to use

    Returns:
        EvaluationResult with scores and feedback
    """
    return await LLMJudge(llm=llm).evaluate(report, query, report_style)

229
src/eval/metrics.py Normal file
View File

@@ -0,0 +1,229 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""
Automated metrics for report quality evaluation.
These metrics can be computed without LLM calls, providing fast and
deterministic quality assessment.
"""
import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from urllib.parse import urlparse
@dataclass
class ReportMetrics:
    """Container for the automated metrics computed over a report."""

    word_count: int = 0
    citation_count: int = 0
    unique_sources: int = 0  # count of distinct source domains
    image_count: int = 0
    section_count: int = 0  # number of required sections found
    sections_found: List[str] = field(default_factory=list)
    sections_missing: List[str] = field(default_factory=list)
    section_coverage_score: float = 0.0  # fraction of required sections present
    has_title: bool = False
    has_key_points: bool = False
    has_overview: bool = False
    has_citations_section: bool = False

    def to_dict(self) -> Dict:
        """Serialize every metric field into a plain dictionary."""
        metric_names = (
            "word_count",
            "citation_count",
            "unique_sources",
            "image_count",
            "section_count",
            "sections_found",
            "sections_missing",
            "section_coverage_score",
            "has_title",
            "has_key_points",
            "has_overview",
            "has_citations_section",
        )
        return {name: getattr(self, name) for name in metric_names}
# Required sections for different report styles.
# Each list names the sections detect_sections() will look for; unknown
# styles fall back to "default".
REPORT_STYLE_SECTIONS = {
    "default": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "academic": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "literature_review",
        "methodology",
        "key_citations",
    ],
    "news": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "popular_science": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "social_media": [
        "title",
        "key_points",
        "overview",
        "key_citations",
    ],
    "strategic_investment": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "executive_summary",
        "market_analysis",
        "technology_analysis",
        "investment_recommendations",
        "key_citations",
    ],
}

# Section name patterns for detection (supports both English and Chinese).
# Keys mirror REPORT_STYLE_SECTIONS entries. The "title" pattern matches a
# level-1 markdown heading and is applied case-sensitively by detect_sections;
# all other patterns are matched against lower-cased text.
SECTION_PATTERNS = {
    "title": r"^#\s+.+",
    "key_points": r"(?:key\s*points|要点|关键发现|核心观点)",
    "overview": r"(?:overview|概述|简介|背景)",
    "detailed_analysis": r"(?:detailed\s*analysis|详细分析|深度分析|分析)",
    "key_citations": r"(?:key\s*citations|references|参考文献|引用|来源)",
    "literature_review": r"(?:literature\s*review|文献综述|研究回顾)",
    "methodology": r"(?:methodology|方法论|研究方法)",
    "executive_summary": r"(?:executive\s*summary|执行摘要|投资建议)",
    "market_analysis": r"(?:market\s*analysis|市场分析|产业分析)",
    "technology_analysis": r"(?:technology|技术.*(?:分析|解析|深度))",
    "investment_recommendations": r"(?:investment.*recommend|投资建议|投资评级)",
}
def count_words(text: str) -> int:
    """Count words in text, handling both English and Chinese.

    English is counted as alphabetic word tokens; Chinese as individual
    CJK characters, the conventional unit for Chinese word counts.
    """
    latin_tokens = re.findall(r"\b[a-zA-Z]+\b", text)
    cjk_chars = re.findall(r"[\u4e00-\u9fff]", text)
    return len(latin_tokens) + len(cjk_chars)
def count_citations(text: str) -> int:
    """Count markdown-style citations [text](url) with http(s) targets."""
    citation_re = re.compile(r"\[.+?\]\(https?://[^\s\)]+\)")
    return sum(1 for _ in citation_re.finditer(text))
def extract_domains(text: str) -> List[str]:
    """Extract unique domains from URLs in the text.

    Args:
        text: Text possibly containing http(s) URLs.

    Returns:
        A list of unique, lower-cased domains with a leading "www." prefix
        removed. Order is unspecified (derived from a set).
    """
    url_pattern = r"https?://([^\s\)\]]+)"
    urls = re.findall(url_pattern, text)
    domains = set()
    for url in urls:
        try:
            parsed = urlparse(f"http://{url}")
            domain = parsed.netloc or url.split("/")[0]
            # Strip only a LEADING "www." prefix. The previous
            # replace("www.", "") removed the substring anywhere in the
            # domain, corrupting hosts like "files.www.example.com".
            domain = domain.lower().removeprefix("www.")
            if domain:
                domains.add(domain)
        except Exception:
            # Skip anything urlparse cannot handle; best-effort extraction.
            continue
    return list(domains)
def count_images(text: str) -> int:
    """Count markdown image embeds of the form ![alt](url)."""
    image_re = re.compile(r"!\[.*?\]\(.+?\)")
    return len(image_re.findall(text))
def detect_sections(text: str, report_style: str = "default") -> Dict[str, bool]:
    """Detect which required sections are present in the report.

    Returns a mapping of section name -> present, for the sections required
    by the given report style (falling back to the "default" style).
    """
    required = REPORT_STYLE_SECTIONS.get(
        report_style, REPORT_STYLE_SECTIONS["default"]
    )
    lowered = text.lower()
    results: Dict[str, bool] = {}
    for name in required:
        pattern = SECTION_PATTERNS.get(name, name.replace("_", r"\s*"))
        if name == "title":
            # Title must be a level-1 markdown heading; matched against the
            # original (case-sensitive) text, line by line.
            found = re.search(pattern, text, re.MULTILINE) is not None
        else:
            found = (
                re.search(pattern, lowered, re.IGNORECASE | re.MULTILINE) is not None
            )
        results[name] = found
    return results
def compute_metrics(
    report: str, report_style: str = "default", target_word_count: Optional[int] = None
) -> ReportMetrics:
    """
    Compute automated metrics for a report.

    Args:
        report: The report text in markdown format
        report_style: The style of report (academic, news, etc.)
        target_word_count: Optional target word count for compliance check
            (accepted for API compatibility; currently unused)

    Returns:
        ReportMetrics object with computed values
    """
    detected = detect_sections(report, report_style)
    found = [name for name, present in detected.items() if present]
    missing = [name for name, present in detected.items() if not present]
    # Coverage defaults to 0.0 for the (theoretical) empty-requirements case.
    coverage = len(found) / len(detected) if detected else 0.0
    return ReportMetrics(
        word_count=count_words(report),
        citation_count=count_citations(report),
        unique_sources=len(extract_domains(report)),
        image_count=count_images(report),
        section_count=len(found),
        sections_found=found,
        sections_missing=missing,
        section_coverage_score=coverage,
        has_title=detected.get("title", False),
        has_key_points=detected.get("key_points", False),
        has_overview=detected.get("overview", False),
        has_citations_section=detected.get("key_citations", False),
    )
def get_word_count_target(report_style: str) -> Dict[str, int]:
    """Return the {"min", "max"} target word range for a report style.

    Unknown styles fall back to the "default" range, so this always
    returns a dict (never None).
    """
    targets = {
        "strategic_investment": {"min": 10000, "max": 15000},
        "academic": {"min": 3000, "max": 8000},
        "news": {"min": 800, "max": 2000},
        "popular_science": {"min": 1500, "max": 4000},
        "social_media": {"min": 500, "max": 1500},
        "default": {"min": 1000, "max": 5000},
    }
    if report_style in targets:
        return targets[report_style]
    return targets["default"]

View File

@@ -35,6 +35,7 @@ from src.podcast.graph.builder import build_graph as build_podcast_graph
from src.ppt.graph.builder import build_graph as build_ppt_graph
from src.prompt_enhancer.graph.builder import build_graph as build_prompt_enhancer_graph
from src.prose.graph.builder import build_graph as build_prose_graph
from src.eval import ReportEvaluator
from src.rag.builder import build_retriever
from src.rag.milvus import load_examples as load_milvus_examples
from src.rag.qdrant import load_examples as load_qdrant_examples
@@ -47,6 +48,7 @@ from src.server.chat_request import (
GenerateProseRequest,
TTSRequest,
)
from src.server.eval_request import EvaluateReportRequest, EvaluateReportResponse
from src.server.config_request import ConfigResponse
from src.server.mcp_request import MCPServerMetadataRequest, MCPServerMetadataResponse
from src.server.mcp_utils import load_mcp_tools
@@ -946,6 +948,39 @@ async def generate_prose(request: GenerateProseRequest):
raise HTTPException(status_code=500, detail=INTERNAL_SERVER_ERROR_DETAIL)
@app.post("/api/report/evaluate", response_model=EvaluateReportResponse)
async def evaluate_report(request: EvaluateReportRequest):
    """Evaluate report quality using automated metrics and optionally LLM-as-Judge.

    When request.use_llm is set, runs the full combined evaluation and also
    returns the LLM detail and a markdown summary; otherwise returns the
    fast metrics-only result.
    """
    try:
        evaluator = ReportEvaluator(use_llm=request.use_llm)
        if request.use_llm:
            # Full evaluation: automated metrics blended with LLM-as-Judge.
            result = await evaluator.evaluate(
                request.content, request.query, request.report_style or "default"
            )
            return EvaluateReportResponse(
                metrics=result.metrics.to_dict(),
                score=result.final_score,
                grade=result.grade,
                llm_evaluation=result.llm_evaluation.to_dict()
                if result.llm_evaluation
                else None,
                summary=result.summary,
            )
        else:
            # Fast path: deterministic metrics only, no LLM call.
            result = evaluator.evaluate_metrics_only(
                request.content, request.report_style or "default"
            )
            return EvaluateReportResponse(
                metrics=result["metrics"],
                score=result["score"],
                grade=result["grade"],
            )
    except Exception as e:
        # Log the full traceback server-side; return a generic message to clients.
        logger.exception(f"Error occurred during report evaluation: {str(e)}")
        raise HTTPException(status_code=500, detail=INTERNAL_SERVER_ERROR_DETAIL)
@app.post("/api/prompt/enhance")
async def enhance_prompt(request: EnhancePromptRequest):
try:

View File

@@ -0,0 +1,71 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""Request models for report evaluation endpoint."""
from typing import Optional
from pydantic import BaseModel, Field
class EvaluateReportRequest(BaseModel):
    """Request model for report evaluation."""

    # Markdown body of the report to be scored.
    content: str = Field(description="Report markdown content to evaluate")
    # The research question the report was generated for.
    query: str = Field(description="Original research query")
    # Selects which required-section set and word-count range apply.
    report_style: Optional[str] = Field(
        default="default", description="Report style (academic, news, etc.)"
    )
    # When True, the endpoint additionally runs LLM-as-Judge evaluation.
    use_llm: bool = Field(
        default=False,
        description="Whether to use LLM for deep evaluation (slower but more detailed)",
    )
class EvaluationMetrics(BaseModel):
    """Automated metrics result."""

    word_count: int
    citation_count: int
    # Count of distinct source domains found in the report's links.
    unique_sources: int
    image_count: int
    # Number of required sections detected for the chosen report style.
    section_count: int
    # Fraction (0.0-1.0) of required sections present.
    section_coverage_score: float
    sections_found: list[str]
    sections_missing: list[str]
    has_title: bool
    has_key_points: bool
    has_overview: bool
    has_citations_section: bool
class LLMEvaluationScores(BaseModel):
    """LLM evaluation scores."""

    # Each criterion is scored 1-10 by the judge; 0 indicates "not evaluated".
    factual_accuracy: int = 0
    completeness: int = 0
    coherence: int = 0
    relevance: int = 0
    citation_quality: int = 0
    writing_quality: int = 0
class LLMEvaluation(BaseModel):
    """LLM evaluation result."""

    scores: LLMEvaluationScores
    # The judge's own holistic rating (1-10).
    overall_score: float
    # Criteria-weight-adjusted average of the per-criterion scores.
    weighted_score: float
    strengths: list[str]
    weaknesses: list[str]
    suggestions: list[str]
class EvaluateReportResponse(BaseModel):
    """Response model for report evaluation."""

    metrics: EvaluationMetrics
    # Final 0-10 score (metrics-only, or blended with the LLM score).
    score: float
    # Letter grade derived from the score (A+ down to F).
    grade: str
    # Populated only when the request asked for LLM evaluation.
    llm_evaluation: Optional[LLMEvaluation] = None
    # Markdown summary; present only for LLM-backed evaluations.
    summary: Optional[str] = None

View File

@@ -0,0 +1,2 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

View File

@@ -0,0 +1,489 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""Unit tests for the combined report evaluator."""
import json
from unittest.mock import AsyncMock, MagicMock
import pytest
from src.eval.evaluator import CombinedEvaluation, ReportEvaluator, score_to_grade
from src.eval.llm_judge import (
EVALUATION_CRITERIA,
MAX_REPORT_LENGTH,
EvaluationResult,
LLMJudge,
)
from src.eval.metrics import ReportMetrics
class TestScoreToGrade:
    """Tests for score to grade conversion."""

    def test_excellent_scores(self):
        """Scores of 8.0 and above map into the A band."""
        assert score_to_grade(9.5) == "A+"
        assert score_to_grade(9.0) == "A+"
        assert score_to_grade(8.7) == "A"
        assert score_to_grade(8.5) == "A"
        assert score_to_grade(8.2) == "A-"

    def test_good_scores(self):
        """Scores from 6.5 up to 8.0 map into the B band."""
        assert score_to_grade(7.8) == "B+"
        assert score_to_grade(7.5) == "B+"
        assert score_to_grade(7.2) == "B"
        assert score_to_grade(7.0) == "B"
        assert score_to_grade(6.7) == "B-"

    def test_average_scores(self):
        """Scores from 5.0 up to 6.5 map into the C band."""
        assert score_to_grade(6.2) == "C+"
        assert score_to_grade(5.8) == "C"
        assert score_to_grade(5.5) == "C"
        assert score_to_grade(5.2) == "C-"

    def test_poor_scores(self):
        """Scores of 4.0-5.0 are a D; anything lower is an F."""
        assert score_to_grade(4.5) == "D"
        assert score_to_grade(4.0) == "D"
        assert score_to_grade(3.0) == "F"
        assert score_to_grade(1.0) == "F"
class TestReportEvaluator:
    """Tests for ReportEvaluator class."""

    @pytest.fixture
    def evaluator(self):
        """Create evaluator without LLM for metrics-only tests."""
        return ReportEvaluator(use_llm=False)

    @pytest.fixture
    def sample_report(self):
        """Sample report for testing.

        Well-formed "default"-style report: title, key points, overview,
        detailed analysis with citations and an image, and a key-citations
        section — so it should score well on section coverage.
        """
        return """
# Comprehensive Research Report
## Key Points
- Important finding number one with significant implications
- Critical discovery that changes our understanding
- Key insight that provides actionable recommendations
- Notable observation from the research data
## Overview
This report presents a comprehensive analysis of the research topic.
The findings are based on extensive data collection and analysis.
## Detailed Analysis
### Section 1: Background
The background of this research involves multiple factors.
[Source 1](https://example.com/source1) provides foundational context.
### Section 2: Methodology
Our methodology follows established research practices.
[Source 2](https://research.org/methods) outlines the approach.
### Section 3: Findings
The key findings include several important discoveries.
![Research Data](https://example.com/chart.png)
[Source 3](https://academic.edu/paper) supports these conclusions.
## Key Citations
- [Example Source](https://example.com/source1)
- [Research Methods](https://research.org/methods)
- [Academic Paper](https://academic.edu/paper)
- [Additional Reference](https://reference.com/doc)
"""

    def test_evaluate_metrics_only(self, evaluator, sample_report):
        """Test metrics-only evaluation."""
        result = evaluator.evaluate_metrics_only(sample_report)
        assert "metrics" in result
        assert "score" in result
        assert "grade" in result
        assert result["score"] > 0
        assert result["grade"] in ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D", "F"]

    def test_evaluate_metrics_only_structure(self, evaluator, sample_report):
        """Test that metrics contain expected fields."""
        result = evaluator.evaluate_metrics_only(sample_report)
        metrics = result["metrics"]
        assert "word_count" in metrics
        assert "citation_count" in metrics
        assert "unique_sources" in metrics
        assert "image_count" in metrics
        assert "section_coverage_score" in metrics

    def test_evaluate_minimal_report(self, evaluator):
        """Test evaluation of minimal report."""
        # A bare sentence has no sections, citations, or images, so it
        # must land in the lowest grade bands.
        minimal_report = "Just some text."
        result = evaluator.evaluate_metrics_only(minimal_report)
        assert result["score"] < 5.0
        assert result["grade"] in ["D", "F"]

    def test_metrics_score_calculation(self, evaluator):
        """Test that metrics score is calculated correctly."""
        good_report = """
# Title
## Key Points
- Point 1
- Point 2
## Overview
Overview content here.
## Detailed Analysis
Analysis with [cite](https://a.com) and [cite2](https://b.com)
and [cite3](https://c.com) and more [refs](https://d.com).
![Image](https://img.com/1.png)
## Key Citations
- [A](https://a.com)
- [B](https://b.com)
"""
        result = evaluator.evaluate_metrics_only(good_report)
        assert result["score"] > 5.0

    def test_combined_evaluation_to_dict(self):
        """Test CombinedEvaluation to_dict method."""
        metrics = ReportMetrics(
            word_count=1000,
            citation_count=5,
            unique_sources=3,
        )
        evaluation = CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=None,
            final_score=7.5,
            grade="B+",
            summary="Test summary",
        )
        result = evaluation.to_dict()
        assert result["final_score"] == 7.5
        assert result["grade"] == "B+"
        assert result["metrics"]["word_count"] == 1000
class TestReportEvaluatorIntegration:
    """Integration tests for evaluator (may require LLM)."""

    @pytest.mark.asyncio
    async def test_full_evaluation_without_llm(self):
        """Test full evaluation with LLM disabled."""
        evaluator = ReportEvaluator(use_llm=False)
        # Minimal but well-formed report: title, key points, overview, citation.
        report = """
# Test Report
## Key Points
- Key point 1
## Overview
Test overview.
## Key Citations
- [Test](https://test.com)
"""
        result = await evaluator.evaluate(report, "test query")
        assert isinstance(result, CombinedEvaluation)
        assert result.final_score > 0
        assert result.grade is not None
        assert result.summary is not None
        # LLM was disabled, so no judge output should be attached.
        assert result.llm_evaluation is None
class TestLLMJudgeParseResponse:
    """Tests for LLMJudge._parse_response method.

    Covers plain JSON, JSON wrapped in fenced code blocks, and the
    fallback-to-defaults path when the LLM response cannot be parsed.
    """

    @pytest.fixture
    def judge(self):
        """Create LLMJudge with mock LLM."""
        return LLMJudge(llm=MagicMock())

    @pytest.fixture
    def valid_response_data(self):
        """Valid evaluation response data."""
        return {
            "scores": {
                "factual_accuracy": 8,
                "completeness": 7,
                "coherence": 9,
                "relevance": 8,
                "citation_quality": 6,
                "writing_quality": 8,
            },
            "overall_score": 8,
            "strengths": ["Well researched", "Clear structure"],
            "weaknesses": ["Could use more citations"],
            "suggestions": ["Add more sources"],
        }

    def test_parse_valid_json(self, judge, valid_response_data):
        """Test parsing valid JSON response."""
        response = json.dumps(valid_response_data)
        result = judge._parse_response(response)
        assert result["scores"]["factual_accuracy"] == 8
        assert result["overall_score"] == 8
        assert "Well researched" in result["strengths"]

    def test_parse_json_in_markdown_block(self, judge, valid_response_data):
        """Test parsing JSON wrapped in markdown code block."""
        response = f"```json\n{json.dumps(valid_response_data)}\n```"
        result = judge._parse_response(response)
        assert result["scores"]["coherence"] == 9
        assert result["overall_score"] == 8

    def test_parse_json_in_generic_code_block(self, judge, valid_response_data):
        """Test parsing JSON in generic code block (no language tag)."""
        response = f"```\n{json.dumps(valid_response_data)}\n```"
        result = judge._parse_response(response)
        assert result["scores"]["relevance"] == 8

    def test_parse_malformed_json_returns_defaults(self, judge):
        """Test that malformed JSON returns default scores."""
        response = "This is not valid JSON at all"
        result = judge._parse_response(response)
        # Every criterion falls back to the neutral score of 5.
        assert result["scores"]["factual_accuracy"] == 5
        assert result["scores"]["completeness"] == 5
        assert result["overall_score"] == 5
        # The parse failure is surfaced in the qualitative fields.
        assert "Unable to parse evaluation" in result["strengths"]
        assert "Evaluation parsing failed" in result["weaknesses"]

    def test_parse_incomplete_json(self, judge):
        """Test parsing incomplete JSON."""
        response = '{"scores": {"factual_accuracy": 8}'  # Missing closing braces
        result = judge._parse_response(response)
        # Should return defaults due to parse failure
        assert result["overall_score"] == 5

    def test_parse_json_with_extra_text(self, judge, valid_response_data):
        """Test parsing JSON with surrounding text."""
        response = f"Here is my evaluation:\n```json\n{json.dumps(valid_response_data)}\n```\nHope this helps!"
        result = judge._parse_response(response)
        assert result["scores"]["factual_accuracy"] == 8
class TestLLMJudgeCalculateWeightedScore:
    """Tests for LLMJudge._calculate_weighted_score method.

    Uses ``pytest.approx`` for the weighted sums: the criterion weights
    (0.25, 0.20, ...) are not exactly representable as binary floats, so
    exact ``==`` comparisons on their products are brittle.
    """

    @pytest.fixture
    def judge(self):
        """Create LLMJudge with mock LLM."""
        return LLMJudge(llm=MagicMock())

    def test_calculate_with_all_scores(self, judge):
        """A uniform score of 10 across all criteria yields 10.0."""
        scores = {
            "factual_accuracy": 10,  # weight 0.25
            "completeness": 10,  # weight 0.20
            "coherence": 10,  # weight 0.20
            "relevance": 10,  # weight 0.15
            "citation_quality": 10,  # weight 0.10
            "writing_quality": 10,  # weight 0.10
        }
        result = judge._calculate_weighted_score(scores)
        assert result == pytest.approx(10.0)

    def test_calculate_with_varied_scores(self, judge):
        """Weighted score is the weight-scaled sum of all criteria."""
        scores = {
            "factual_accuracy": 8,  # 8 * 0.25 = 2.0
            "completeness": 6,  # 6 * 0.20 = 1.2
            "coherence": 7,  # 7 * 0.20 = 1.4
            "relevance": 9,  # 9 * 0.15 = 1.35
            "citation_quality": 5,  # 5 * 0.10 = 0.5
            "writing_quality": 8,  # 8 * 0.10 = 0.8
        }
        # Total: 7.25
        result = judge._calculate_weighted_score(scores)
        assert result == pytest.approx(7.25)

    def test_calculate_with_partial_scores(self, judge):
        """Missing criteria renormalize over the weights that are present."""
        scores = {
            "factual_accuracy": 8,  # weight 0.25
            "completeness": 6,  # weight 0.20
        }
        # (8 * 0.25 + 6 * 0.20) / (0.25 + 0.20) = 3.2 / 0.45 = 7.11
        result = judge._calculate_weighted_score(scores)
        assert result == pytest.approx(7.11, abs=0.01)

    def test_calculate_with_unknown_criteria(self, judge):
        """Criteria without a configured weight are ignored."""
        scores = {
            "factual_accuracy": 10,
            "unknown_criterion": 1,  # Should be ignored
        }
        result = judge._calculate_weighted_score(scores)
        assert result == pytest.approx(10.0)

    def test_calculate_with_empty_scores(self, judge):
        """An empty scores dict yields 0.0 rather than dividing by zero."""
        result = judge._calculate_weighted_score({})
        assert result == 0.0

    def test_weights_sum_to_one(self):
        """Verify that all criteria weights sum to 1.0."""
        total_weight = sum(c["weight"] for c in EVALUATION_CRITERIA.values())
        assert abs(total_weight - 1.0) < 0.001
class TestLLMJudgeEvaluate:
    """Tests for LLMJudge.evaluate method with mocked LLM.

    The repeated AsyncMock/MagicMock wiring and the neutral all-sevens
    response payload are factored into helpers so each test states only
    what is specific to it.
    """

    @staticmethod
    def _make_mock_llm(content: str) -> AsyncMock:
        """Build an AsyncMock LLM whose ainvoke() resolves to `content`."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = content
        mock_llm.ainvoke.return_value = mock_response
        return mock_llm

    @staticmethod
    def _neutral_response() -> str:
        """A syntactically valid evaluation with a flat score of 7 everywhere."""
        return json.dumps(
            {
                "scores": {k: 7 for k in EVALUATION_CRITERIA.keys()},
                "overall_score": 7,
                "strengths": [],
                "weaknesses": [],
                "suggestions": [],
            }
        )

    @pytest.fixture
    def valid_llm_response(self):
        """Create a valid LLM response."""
        return json.dumps(
            {
                "scores": {
                    "factual_accuracy": 8,
                    "completeness": 7,
                    "coherence": 9,
                    "relevance": 8,
                    "citation_quality": 7,
                    "writing_quality": 8,
                },
                "overall_score": 8,
                "strengths": ["Comprehensive coverage", "Well structured"],
                "weaknesses": ["Some claims need more support"],
                "suggestions": ["Add more academic sources"],
            }
        )

    @pytest.mark.asyncio
    async def test_successful_evaluation(self, valid_llm_response):
        """Test successful LLM evaluation."""
        judge = LLMJudge(llm=self._make_mock_llm(valid_llm_response))
        result = await judge.evaluate("Test report", "Test query")
        assert isinstance(result, EvaluationResult)
        assert result.scores["factual_accuracy"] == 8
        assert result.overall_score == 8
        assert result.weighted_score > 0
        assert "Comprehensive coverage" in result.strengths
        assert result.raw_response == valid_llm_response

    @pytest.mark.asyncio
    async def test_evaluation_with_llm_failure(self):
        """Test that LLM failures are handled gracefully."""
        mock_llm = AsyncMock()
        mock_llm.ainvoke.side_effect = Exception("LLM service unavailable")
        judge = LLMJudge(llm=mock_llm)
        result = await judge.evaluate("Test report", "Test query")
        # The failure must be reported via zeroed scores, not an exception.
        assert isinstance(result, EvaluationResult)
        assert result.overall_score == 0
        assert result.weighted_score == 0
        assert all(score == 0 for score in result.scores.values())
        assert any("failed" in w.lower() for w in result.weaknesses)

    @pytest.mark.asyncio
    async def test_evaluation_with_malformed_response(self):
        """Test handling of malformed LLM response."""
        judge = LLMJudge(
            llm=self._make_mock_llm("I cannot evaluate this report properly.")
        )
        result = await judge.evaluate("Test report", "Test query")
        # Should return default scores
        assert result.scores["factual_accuracy"] == 5
        assert result.overall_score == 5

    @pytest.mark.asyncio
    async def test_evaluation_passes_report_style(self):
        """Test that report_style is passed to LLM."""
        mock_llm = self._make_mock_llm(self._neutral_response())
        judge = LLMJudge(llm=mock_llm)
        await judge.evaluate("Test report", "Test query", report_style="academic")
        # Verify the prompt sent to the LLM contains the report style.
        messages = mock_llm.ainvoke.call_args[0][0]
        user_message_content = messages[1].content
        assert "academic" in user_message_content

    @pytest.mark.asyncio
    async def test_evaluation_truncates_long_reports(self):
        """Test that very long reports are truncated."""
        mock_llm = self._make_mock_llm(self._neutral_response())
        judge = LLMJudge(llm=mock_llm)
        long_report = "x" * (MAX_REPORT_LENGTH + 5000)
        await judge.evaluate(long_report, "Test query")
        messages = mock_llm.ainvoke.call_args[0][0]
        user_message_content = messages[1].content
        # The untruncated report must not appear verbatim in the prompt;
        # the original length-only bound could pass even with no truncation.
        assert long_report not in user_message_content
        # And the prompt stays bounded near MAX_REPORT_LENGTH.
        assert len(user_message_content) < len(long_report) + 500
class TestEvaluationResult:
    """Tests for EvaluationResult dataclass."""

    def test_to_dict(self):
        """to_dict exposes all evaluation fields except the raw LLM response."""
        evaluation = EvaluationResult(
            scores={"factual_accuracy": 8, "completeness": 7},
            overall_score=7.5,
            weighted_score=7.6,
            strengths=["Good research"],
            weaknesses=["Needs more detail"],
            suggestions=["Expand section 2"],
            raw_response="test response",
        )
        serialized = evaluation.to_dict()
        assert serialized["scores"]["factual_accuracy"] == 8
        assert serialized["overall_score"] == 7.5
        assert serialized["weighted_score"] == 7.6
        assert "Good research" in serialized["strengths"]
        # The raw LLM response is intentionally excluded from serialization.
        assert "raw_response" not in serialized

View File

@@ -0,0 +1,207 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""Unit tests for report evaluation metrics."""
from src.eval.metrics import (
compute_metrics,
count_citations,
count_images,
count_words,
detect_sections,
extract_domains,
get_word_count_target,
)
class TestCountWords:
    """Tests for word counting function."""

    def test_english_words(self):
        # Six whitespace-separated English words.
        text = "This is a simple test sentence."
        assert count_words(text) == 6

    def test_chinese_characters(self):
        # CJK text has no spaces; each character counts as one word.
        text = "这是一个测试"
        assert count_words(text) == 6

    def test_mixed_content(self):
        text = "Hello 你好 World 世界"
        assert count_words(text) == 4 + 2  # 4 Chinese characters + 2 English words

    def test_empty_string(self):
        assert count_words("") == 0
class TestCountCitations:
    """Tests for citation counting function."""

    def test_markdown_citations(self):
        """Two markdown links with http(s) targets count as two citations."""
        text = """
Check out [Google](https://google.com) and [GitHub](https://github.com).
"""
        assert count_citations(text) == 2

    def test_no_citations(self):
        """Plain prose contains no citations."""
        assert count_citations("This is plain text without any links.") == 0

    def test_invalid_urls(self):
        """Non-URL and non-http(s) link targets are not counted."""
        assert count_citations("[Link](not-a-url) [Another](ftp://ftp.example.com)") == 0

    def test_complex_urls(self):
        """Paths and query strings do not break citation detection."""
        assert count_citations("[Article](https://example.com/path/to/article?id=123&ref=test)") == 1
class TestExtractDomains:
    """Tests for domain extraction function."""

    def test_extract_multiple_domains(self):
        """Each distinct host yields one domain; a `www.` prefix is stripped."""
        text = """
https://google.com/search
https://www.github.com/user/repo
https://docs.python.org/3/
"""
        domains = extract_domains(text)
        assert len(domains) == 3
        for expected in ("google.com", "github.com", "docs.python.org"):
            assert expected in domains

    def test_deduplicate_domains(self):
        """Multiple URLs on one host collapse to a single domain."""
        text = """
https://example.com/page1
https://example.com/page2
https://www.example.com/page3
"""
        domains = extract_domains(text)
        assert len(domains) == 1
        assert "example.com" in domains

    def test_no_urls(self):
        """Text without URLs yields an empty list."""
        assert extract_domains("Plain text without URLs") == []
class TestCountImages:
    """Tests for image counting function."""

    def test_markdown_images(self):
        """Markdown images, with or without alt text, are both counted."""
        text = """
![Alt text](https://example.com/image1.png)
![](https://example.com/image2.jpg)
"""
        assert count_images(text) == 2

    def test_no_images(self):
        """A plain markdown link is not an image."""
        assert count_images("Text without images [link](url)") == 0
class TestDetectSections:
    """Tests for section detection function."""

    def test_detect_title(self):
        """A level-1 heading is recognized as the report title."""
        sections = detect_sections("# My Report Title\n\nSome content here.")
        assert sections.get("title") is True

    def test_detect_key_points(self):
        """A '## Key Points' heading is recognized."""
        sections = detect_sections("## Key Points\n- Point 1\n- Point 2")
        assert sections.get("key_points") is True

    def test_detect_chinese_sections(self):
        """Chinese headings map onto the same section keys."""
        text = """# 报告标题
## 要点
- 要点1
## 概述
这是概述内容
"""
        sections = detect_sections(text)
        for key in ("title", "key_points", "overview"):
            assert sections.get(key) is True

    def test_detect_citations_section(self):
        """A '## Key Citations' heading is recognized."""
        text = """
## Key Citations
- [Source 1](https://example.com)
"""
        sections = detect_sections(text)
        assert sections.get("key_citations") is True
class TestComputeMetrics:
    """Tests for the main compute_metrics function."""

    def test_complete_report(self):
        # A report containing every expected section, citations, and an
        # image should register on all of the structural metrics.
        report = """
# Research Report Title
## Key Points
- Point 1
- Point 2
- Point 3
## Overview
This is an overview of the research topic.
## Detailed Analysis
Here is the detailed analysis with [source](https://example.com).
![Figure 1](https://example.com/image.png)
## Key Citations
- [Source 1](https://example.com)
- [Source 2](https://another.com)
"""
        metrics = compute_metrics(report)
        assert metrics.has_title is True
        assert metrics.has_key_points is True
        assert metrics.has_overview is True
        assert metrics.has_citations_section is True
        assert metrics.citation_count >= 2
        assert metrics.image_count == 1
        assert metrics.unique_sources >= 1
        assert metrics.section_coverage_score > 0.5

    def test_minimal_report(self):
        # Unstructured prose: no title, no citations, low section coverage.
        report = "Just some text without structure."
        metrics = compute_metrics(report)
        assert metrics.has_title is False
        assert metrics.citation_count == 0
        assert metrics.section_coverage_score < 0.5

    def test_metrics_to_dict(self):
        # to_dict() must expose the core numeric fields for serialization.
        report = "# Title\n\nSome content"
        metrics = compute_metrics(report)
        result = metrics.to_dict()
        assert isinstance(result, dict)
        assert "word_count" in result
        assert "citation_count" in result
        assert "section_coverage_score" in result
class TestGetWordCountTarget:
    """Tests for word count target function."""

    def test_strategic_investment_target(self):
        """Strategic-investment reports target 10k-15k words."""
        bounds = get_word_count_target("strategic_investment")
        assert (bounds["min"], bounds["max"]) == (10000, 15000)

    def test_news_target(self):
        """News reports target 800-2000 words."""
        bounds = get_word_count_target("news")
        assert (bounds["min"], bounds["max"]) == (800, 2000)

    def test_default_target(self):
        """Unknown styles fall back to the 1000-5000 default."""
        bounds = get_word_count_target("unknown_style")
        assert (bounds["min"], bounds["max"]) == (1000, 5000)

View File

@@ -150,6 +150,7 @@
"downloadWord": "Word (.docx)",
"downloadImage": "Image (.png)",
"exportFailed": "Export failed, please try again",
"evaluateReport": "Evaluate report quality",
"searchingFor": "Searching for",
"reading": "Reading",
"runningPythonCode": "Running Python code",
@@ -163,6 +164,31 @@
"errorGeneratingPodcast": "Error when generating podcast. Please try again.",
"downloadPodcast": "Download podcast"
},
"evaluation": {
"title": "Report Quality Evaluation",
"description": "Evaluate your report using automated metrics and AI analysis.",
"evaluating": "Evaluating report...",
"analyzing": "Running deep analysis...",
"overallScore": "Overall Score",
"metrics": "Report Metrics",
"wordCount": "Word Count",
"citations": "Citations",
"sources": "Unique Sources",
"images": "Images",
"sectionCoverage": "Section Coverage",
"detailedAnalysis": "Detailed Analysis",
"deepEvaluation": "Deep Evaluation (AI)",
"strengths": "Strengths",
"weaknesses": "Areas for Improvement",
"scores": {
"factual_accuracy": "Factual Accuracy",
"completeness": "Completeness",
"coherence": "Coherence",
"relevance": "Relevance",
"citation_quality": "Citation Quality",
"writing_quality": "Writing Quality"
}
},
"messages": {
"replaying": "Replaying",
"replayDescription": "DeerFlow is now replaying the conversation...",

View File

@@ -150,6 +150,7 @@
"downloadWord": "Word (.docx)",
"downloadImage": "图片 (.png)",
"exportFailed": "导出失败,请重试",
"evaluateReport": "评估报告质量",
"searchingFor": "搜索",
"reading": "阅读中",
"runningPythonCode": "运行 Python 代码",
@@ -163,6 +164,31 @@
"errorGeneratingPodcast": "生成播客时出错。请重试。",
"downloadPodcast": "下载播客"
},
"evaluation": {
"title": "报告质量评估",
"description": "使用自动化指标和 AI 分析评估您的报告。",
"evaluating": "正在评估报告...",
"analyzing": "正在进行深度分析...",
"overallScore": "总体评分",
"metrics": "报告指标",
"wordCount": "字数",
"citations": "引用数",
"sources": "独立来源",
"images": "图片数",
"sectionCoverage": "章节覆盖率",
"detailedAnalysis": "详细分析",
"deepEvaluation": "深度评估 (AI)",
"strengths": "优势",
"weaknesses": "改进建议",
"scores": {
"factual_accuracy": "事实准确性",
"completeness": "完整性",
"coherence": "连贯性",
"relevance": "相关性",
"citation_quality": "引用质量",
"writing_quality": "写作质量"
}
},
"messages": {
"replaying": "回放中",
"replayDescription": "DeerFlow 正在回放对话...",

View File

@@ -0,0 +1,300 @@
// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
// SPDX-License-Identifier: MIT
import {
BookOpen,
FileText,
Image,
Link2,
Loader2,
Sparkles,
ThumbsDown,
ThumbsUp,
} from "lucide-react";
import { useTranslations } from "next-intl";
import { useCallback, useEffect, useRef, useState } from "react";
import { Button } from "~/components/ui/button";
import {
Dialog,
DialogContent,
DialogDescription,
DialogHeader,
DialogTitle,
} from "~/components/ui/dialog";
import { Progress } from "~/components/ui/progress";
import { evaluateReport, type EvaluationResult } from "~/core/api";
import { cn } from "~/lib/utils";
interface EvaluationDialogProps {
open: boolean;
onOpenChange: (open: boolean) => void;
reportContent: string;
query: string;
reportStyle?: string;
}
/**
 * Circular colored badge showing the report's letter grade.
 * Unknown grades fall back to a neutral gray background.
 */
function GradeBadge({ grade }: { grade: string }) {
  const gradeColors: Record<string, string> = {
    "A+": "bg-emerald-500",
    A: "bg-emerald-500",
    "A-": "bg-emerald-400",
    "B+": "bg-blue-500",
    B: "bg-blue-500",
    "B-": "bg-blue-400",
    "C+": "bg-yellow-500",
    C: "bg-yellow-500",
    "C-": "bg-yellow-400",
    D: "bg-orange-500",
    F: "bg-red-500",
  };
  const background = gradeColors[grade] ?? "bg-gray-500";
  return (
    <div
      aria-label={`Report grade: ${grade}`}
      className={cn(
        "flex h-16 w-16 items-center justify-center rounded-full text-2xl font-bold text-white",
        background,
      )}
    >
      {grade}
    </div>
  );
}
/**
 * One labeled row in the metrics panel: an icon, a label, and a
 * right-aligned value with an optional suffix (e.g. "%").
 */
function MetricItem({
  icon: Icon,
  label,
  value,
  suffix,
}: {
  icon: React.ComponentType<{ className?: string }>;
  label: string;
  value: number | string;
  suffix?: string;
}) {
  return (
    <div className="flex items-center gap-3">
      <Icon className="text-muted-foreground h-4 w-4" />
      <span className="text-muted-foreground text-sm">{label}</span>
      {/* ml-auto pushes the value to the right edge of the row */}
      <span className="ml-auto font-medium">
        {value}
        {suffix}
      </span>
    </div>
  );
}
/**
 * Modal dialog that evaluates the quality of a generated research report.
 *
 * On first open it immediately runs the fast, metrics-only evaluation; the
 * user can then opt into a slower LLM-backed "deep evaluation" via the button
 * at the bottom. All state is reset when the dialog closes.
 */
export function EvaluationDialog({
  open,
  onOpenChange,
  reportContent,
  query,
  reportStyle,
}: EvaluationDialogProps) {
  const t = useTranslations("chat.evaluation");
  // `loading` tracks the initial metrics-only pass; `deepLoading` tracks the
  // optional LLM pass. They drive different spinners in the UI below.
  const [loading, setLoading] = useState(false);
  const [deepLoading, setDeepLoading] = useState(false);
  const [result, setResult] = useState<EvaluationResult | null>(null);
  const [error, setError] = useState<string | null>(null);
  // Ref (not state) so the auto-run effect below fires at most once per open,
  // even if `runEvaluation`'s identity changes while the dialog is visible.
  const hasRunInitialEvaluation = useRef(false);

  // Runs either the quick metrics-only evaluation (useLlm=false) or the deep
  // LLM-backed evaluation (useLlm=true); both write into the same result slot.
  const runEvaluation = useCallback(
    async (useLlm: boolean) => {
      if (useLlm) {
        setDeepLoading(true);
      } else {
        setLoading(true);
      }
      setError(null);
      try {
        const evalResult = await evaluateReport(
          reportContent,
          query,
          reportStyle,
          useLlm,
        );
        setResult(evalResult);
      } catch (err) {
        setError(err instanceof Error ? err.message : "Evaluation failed");
      } finally {
        // Clear both flags; only one of them was set above.
        setLoading(false);
        setDeepLoading(false);
      }
    },
    [reportContent, query, reportStyle],
  );

  // Kick off the metrics-only evaluation the first time the dialog opens.
  useEffect(() => {
    if (open && !hasRunInitialEvaluation.current) {
      hasRunInitialEvaluation.current = true;
      void runEvaluation(false);
    }
  }, [open, runEvaluation]);

  // Reset everything on close so the next open starts from a clean slate.
  useEffect(() => {
    if (!open) {
      setResult(null);
      setError(null);
      hasRunInitialEvaluation.current = false;
    }
  }, [open]);

  return (
    <Dialog open={open} onOpenChange={onOpenChange}>
      <DialogContent className="sm:max-w-md">
        <DialogHeader>
          <DialogTitle>{t("title")}</DialogTitle>
          <DialogDescription>{t("description")}</DialogDescription>
        </DialogHeader>
        {loading && !result ? (
          <div className="flex flex-col items-center justify-center py-8">
            <Loader2 className="h-8 w-8 animate-spin text-blue-500" />
            <p className="text-muted-foreground mt-4 text-sm">
              {t("evaluating")}
            </p>
          </div>
        ) : error ? (
          <div className="py-4 text-center text-red-500">{error}</div>
        ) : result ? (
          <div className="space-y-6">
            {/* Grade and Score */}
            <div className="flex items-center gap-6">
              <GradeBadge grade={result.grade} />
              <div>
                <div className="text-3xl font-bold">{result.score}/10</div>
                <div className="text-muted-foreground text-sm">
                  {t("overallScore")}
                </div>
              </div>
            </div>
            {/* Metrics */}
            <div className="space-y-3">
              <h4 className="text-sm font-medium">{t("metrics")}</h4>
              <div className="bg-muted/50 space-y-2 rounded-lg p-3">
                <MetricItem
                  icon={FileText}
                  label={t("wordCount")}
                  value={result.metrics.word_count.toLocaleString()}
                />
                <MetricItem
                  icon={Link2}
                  label={t("citations")}
                  value={result.metrics.citation_count}
                />
                <MetricItem
                  icon={BookOpen}
                  label={t("sources")}
                  value={result.metrics.unique_sources}
                />
                <MetricItem
                  icon={Image}
                  label={t("images")}
                  value={result.metrics.image_count}
                />
                {/* Section coverage rendered as a percentage progress bar */}
                <div className="pt-2">
                  <div className="mb-1 flex items-center justify-between text-sm">
                    <span className="text-muted-foreground">
                      {t("sectionCoverage")}
                    </span>
                    <span className="font-medium">
                      {Math.round(result.metrics.section_coverage_score * 100)}%
                    </span>
                  </div>
                  <Progress
                    value={result.metrics.section_coverage_score * 100}
                    className="h-2"
                  />
                </div>
              </div>
            </div>
            {/* LLM Evaluation Results (only present after a deep evaluation) */}
            {result.llm_evaluation && (
              <div className="space-y-3">
                <h4 className="text-sm font-medium">{t("detailedAnalysis")}</h4>
                {/* LLM Scores */}
                <div className="bg-muted/50 grid grid-cols-2 gap-2 rounded-lg p-3 text-sm">
                  {Object.entries(result.llm_evaluation.scores).map(
                    ([key, value]) => (
                      <div key={key} className="flex justify-between">
                        <span className="text-muted-foreground">
                          {t(`scores.${key}`)}
                        </span>
                        <span className="font-medium">{value}/10</span>
                      </div>
                    ),
                  )}
                </div>
                {/* Strengths (capped at three entries) */}
                {result.llm_evaluation.strengths.length > 0 && (
                  <div className="space-y-2">
                    <div className="flex items-center gap-2 text-sm font-medium text-emerald-600">
                      <ThumbsUp className="h-4 w-4" />
                      {t("strengths")}
                    </div>
                    <ul className="space-y-1 text-sm">
                      {result.llm_evaluation.strengths
                        .slice(0, 3)
                        .map((s, i) => (
                          <li key={i} className="text-muted-foreground">
                            {s}
                          </li>
                        ))}
                    </ul>
                  </div>
                )}
                {/* Weaknesses (capped at three entries) */}
                {result.llm_evaluation.weaknesses.length > 0 && (
                  <div className="space-y-2">
                    <div className="flex items-center gap-2 text-sm font-medium text-orange-600">
                      <ThumbsDown className="h-4 w-4" />
                      {t("weaknesses")}
                    </div>
                    <ul className="space-y-1 text-sm">
                      {result.llm_evaluation.weaknesses
                        .slice(0, 3)
                        .map((w, i) => (
                          <li key={i} className="text-muted-foreground">
                            {w}
                          </li>
                        ))}
                    </ul>
                  </div>
                )}
              </div>
            )}
            {/* Deep Evaluation Button — hidden once an LLM evaluation exists */}
            {!result.llm_evaluation && (
              <Button
                variant="outline"
                className="w-full"
                onClick={() => runEvaluation(true)}
                disabled={deepLoading}
              >
                {deepLoading ? (
                  <>
                    <Loader2 className="mr-2 h-4 w-4 animate-spin" />
                    {t("analyzing")}
                  </>
                ) : (
                  <>
                    <Sparkles className="mr-2 h-4 w-4" />
                    {t("deepEvaluation")}
                  </>
                )}
              </Button>
            )}
          </div>
        ) : null}
      </DialogContent>
    </Dialog>
  );
}

View File

@@ -16,6 +16,7 @@ import { jsPDF } from "jspdf";
import {
Check,
Copy,
GraduationCap,
Headphones,
Pencil,
Undo2,
@@ -43,9 +44,10 @@ import {
} from "~/components/ui/dropdown-menu";
import { Tabs, TabsContent, TabsList, TabsTrigger } from "~/components/ui/tabs";
import { useReplay } from "~/core/replay";
import { closeResearch, listenToPodcast, useStore } from "~/core/store";
import { closeResearch, getResearchQuery, listenToPodcast, useStore, useSettingsStore } from "~/core/store";
import { cn } from "~/lib/utils";
import { EvaluationDialog } from "./evaluation-dialog";
import { ResearchActivitiesBlock } from "./research-activities-block";
import { ResearchReportBlock } from "./research-report-block";
@@ -84,6 +86,7 @@ export function ResearchBlock({
const [editing, setEditing] = useState(false);
const [isDownloading, setIsDownloading] = useState(false);
const [copied, setCopied] = useState(false);
const [showEvaluation, setShowEvaluation] = useState(false);
const handleCopy = useCallback(() => {
if (!reportId) {
return;
@@ -676,6 +679,16 @@ ${htmlContent}
{copied ? <Check /> : <Copy />}
</Button>
</Tooltip>
<Tooltip title={t("evaluateReport")}>
<Button
className="text-gray-400"
size="icon"
variant="ghost"
onClick={() => setShowEvaluation(true)}
>
<GraduationCap />
</Button>
</Tooltip>
<DropdownMenu>
<Tooltip title={t("downloadReport")}>
<DropdownMenuTrigger asChild>
@@ -796,6 +809,19 @@ ${htmlContent}
</TabsContent>
</Tabs>
</Card>
{/* Evaluation Dialog */}
{reportId && researchId && (
<EvaluationDialog
open={showEvaluation}
onOpenChange={setShowEvaluation}
reportContent={
useStore.getState().messages.get(reportId)?.content ?? ""
}
query={getResearchQuery(researchId)}
reportStyle={useSettingsStore.getState().general.reportStyle.toLowerCase()}
/>
)}
</div>
);
}

View File

@@ -0,0 +1,30 @@
"use client"
import * as React from "react"
import { cn } from "~/lib/utils"
interface ProgressProps extends React.HTMLAttributes<HTMLDivElement> {
value?: number
}
/**
 * Minimal progress bar: a track with a width-animated indicator.
 * The value is clamped to [0, 100] so out-of-range input never overflows.
 */
function Progress({ className, value = 0, ...props }: ProgressProps) {
  const clamped = Math.min(100, Math.max(0, value));
  return (
    <div
      data-slot="progress"
      className={cn(
        "bg-primary/20 relative h-2 w-full overflow-hidden rounded-full",
        className
      )}
      {...props}
    >
      <div
        data-slot="progress-indicator"
        className="bg-primary h-full transition-all duration-300 ease-in-out"
        style={{ width: `${clamped}%` }}
      />
    </div>
  )
}
export { Progress }

View File

@@ -0,0 +1,91 @@
// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
// SPDX-License-Identifier: MIT
import { resolveServiceURL } from "./resolve-service-url";
/**
* Report evaluation API client.
*/
/** Automated, LLM-free metrics computed from the report markdown. */
export interface EvaluationMetrics {
  word_count: number;
  citation_count: number;
  unique_sources: number;
  image_count: number;
  section_count: number;
  /** Fraction of expected sections present, in [0, 1]. */
  section_coverage_score: number;
  sections_found: string[];
  sections_missing: string[];
  has_title: boolean;
  has_key_points: boolean;
  has_overview: boolean;
  has_citations_section: boolean;
}

/** Per-criterion scores from the LLM-as-Judge pass (1-10 each). */
export interface LLMEvaluationScores {
  factual_accuracy: number;
  completeness: number;
  coherence: number;
  relevance: number;
  citation_quality: number;
  writing_quality: number;
}

/** Full result of the optional LLM-based deep evaluation. */
export interface LLMEvaluation {
  scores: LLMEvaluationScores;
  overall_score: number;
  /** Criterion scores combined by their configured weights. */
  weighted_score: number;
  strengths: string[];
  weaknesses: string[];
  suggestions: string[];
}

/** Combined evaluation returned by the /api/report/evaluate endpoint. */
export interface EvaluationResult {
  metrics: EvaluationMetrics;
  /** Final score on a 1-10 scale. */
  score: number;
  /** Letter grade, A+ through F. */
  grade: string;
  /** Present only when the evaluation was run with use_llm=true. */
  llm_evaluation?: LLMEvaluation;
  summary?: string;
}

/** Request body for the /api/report/evaluate endpoint. */
export interface EvaluateReportRequest {
  content: string;
  query: string;
  report_style?: string;
  use_llm?: boolean;
}
/**
* Evaluate a report's quality using automated metrics and optionally LLM-as-Judge.
*
* @param content - Report markdown content
* @param query - Original research query
* @param reportStyle - Report style (academic, news, etc.)
* @param useLlm - Whether to use LLM for deep evaluation
* @returns Evaluation result with metrics, score, and grade
*/
/**
 * Evaluate a report's quality using automated metrics and optionally LLM-as-Judge.
 *
 * @param content - Report markdown content
 * @param query - Original research query
 * @param reportStyle - Report style (academic, news, etc.); defaults to "default"
 * @param useLlm - Whether to use LLM for deep evaluation; defaults to false
 * @returns Evaluation result with metrics, score, and grade
 * @throws Error when the server responds with a non-2xx status
 */
export async function evaluateReport(
  content: string,
  query: string,
  reportStyle?: string,
  useLlm?: boolean,
): Promise<EvaluationResult> {
  const response = await fetch(resolveServiceURL("report/evaluate"), {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      content,
      query,
      report_style: reportStyle ?? "default",
      use_llm: useLlm ?? false,
    } satisfies EvaluateReportRequest),
  });
  if (!response.ok) {
    // statusText is frequently empty over HTTP/2, so include the numeric
    // status code to keep the error message meaningful.
    throw new Error(
      `Evaluation failed: ${response.status} ${response.statusText}`,
    );
  }
  return response.json() as Promise<EvaluationResult>;
}

View File

@@ -2,6 +2,7 @@
// SPDX-License-Identifier: MIT
export * from "./chat";
export * from "./evaluate";
export * from "./mcp";
export * from "./podcast";
export * from "./prompt-enhancer";

View File

@@ -24,6 +24,7 @@ export const useStore = create<{
researchPlanIds: Map<string, string>;
researchReportIds: Map<string, string>;
researchActivityIds: Map<string, string[]>;
researchQueries: Map<string, string>;
ongoingResearchId: string | null;
openResearchId: string | null;
@@ -42,6 +43,7 @@ export const useStore = create<{
researchPlanIds: new Map<string, string>(),
researchReportIds: new Map<string, string>(),
researchActivityIds: new Map<string, string[]>(),
researchQueries: new Map<string, string>(),
ongoingResearchId: null,
openResearchId: null,
@@ -267,11 +269,17 @@ function getOngoingResearchId() {
function appendResearch(researchId: string) {
let planMessage: Message | undefined;
let userQuery: string | undefined;
const reversedMessageIds = [...useStore.getState().messageIds].reverse();
for (const messageId of reversedMessageIds) {
const message = getMessage(messageId);
if (message?.agent === "planner") {
if (!planMessage && message?.agent === "planner") {
planMessage = message;
}
if (!userQuery && message?.role === "user") {
userQuery = message.content;
}
if (planMessage && userQuery) {
break;
}
}
@@ -288,6 +296,10 @@ function appendResearch(researchId: string) {
researchId,
messageIds,
),
researchQueries: new Map(useStore.getState().researchQueries).set(
researchId,
userQuery ?? "",
),
});
}
@@ -394,6 +406,10 @@ export function useResearchMessage(researchId: string) {
);
}
/** Return the user query associated with a research id, or "" if unknown. */
export function getResearchQuery(researchId: string): string {
  const { researchQueries } = useStore.getState();
  return researchQueries.get(researchId) ?? "";
}
export function useMessage(messageId: string | null | undefined) {
return useStore(
useShallow((state) =>