mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-03 06:12:14 +08:00
feat(eval): add report quality evaluation module and UI integration (#776)
* feat(eval): add report quality evaluation module Addresses issue #773 - How to evaluate generated report quality objectively. This module provides two evaluation approaches: 1. Automated metrics (no LLM required): - Citation count and source diversity - Word count compliance per report style - Section structure validation - Image inclusion tracking 2. LLM-as-Judge evaluation: - Factual accuracy scoring - Completeness assessment - Coherence evaluation - Relevance and citation quality checks The combined evaluator provides a final score (1-10) and letter grade (A+ to F). Files added: - src/eval/__init__.py - src/eval/metrics.py - src/eval/llm_judge.py - src/eval/evaluator.py - tests/unit/eval/test_metrics.py - tests/unit/eval/test_evaluator.py * feat(eval): integrate report evaluation with web UI This commit adds the web UI integration for the evaluation module: Backend: - Add EvaluateReportRequest/Response models in src/server/eval_request.py - Add /api/report/evaluate endpoint to src/server/app.py Frontend: - Add evaluateReport API function in web/src/core/api/evaluate.ts - Create EvaluationDialog component with grade badge, metrics display, and optional LLM deep evaluation - Add evaluation button (graduation cap icon) to research-block.tsx toolbar - Add i18n translations for English and Chinese The evaluation UI allows users to: 1. View quick metrics-only evaluation (instant) 2. Optionally run deep LLM-based evaluation for detailed analysis 3. 
See grade (A+ to F), score (1-10), and metric breakdown * feat(eval): improve evaluation reliability and add LLM judge tests - Extract MAX_REPORT_LENGTH constant in llm_judge.py for maintainability - Add comprehensive unit tests for LLMJudge class (parse_response, calculate_weighted_score, evaluate with mocked LLM) - Pass reportStyle prop to EvaluationDialog for accurate evaluation criteria - Add researchQueries store map to reliably associate queries with research - Add getResearchQuery helper to retrieve query by researchId - Remove unused imports in test_metrics.py * fix(eval): use resolveServiceURL for evaluate API endpoint The evaluateReport function was using a relative URL '/api/report/evaluate' which sent requests to the Next.js server instead of the FastAPI backend. Changed to use resolveServiceURL() consistent with other API functions. * fix: improve type accuracy and React hooks in evaluation components - Fix get_word_count_target return type from Optional[Dict] to Dict since it always returns a value via default fallback - Fix useEffect dependency issue in EvaluationDialog using useRef to prevent unwanted re-evaluations - Add aria-label to GradeBadge for screen reader accessibility
This commit is contained in:
21
src/eval/__init__.py
Normal file
21
src/eval/__init__.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
Report Quality Evaluation Module for DeerFlow.
|
||||
|
||||
This module provides objective methods to evaluate generated report quality,
|
||||
including automated metrics and LLM-based evaluation.
|
||||
"""
|
||||
|
||||
from .evaluator import ReportEvaluator
|
||||
from .metrics import ReportMetrics, compute_metrics
|
||||
from .llm_judge import LLMJudge, evaluate_with_llm
|
||||
|
||||
# Public API of the evaluation package; everything else is an
# implementation detail of the submodules.
__all__ = [
    "ReportEvaluator",
    "ReportMetrics",
    "compute_metrics",
    "LLMJudge",
    "evaluate_with_llm",
]
|
||||
249
src/eval/evaluator.py
Normal file
249
src/eval/evaluator.py
Normal file
@@ -0,0 +1,249 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
Combined report evaluator orchestrating both automated metrics and LLM evaluation.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from .llm_judge import EvaluationResult, LLMJudge
|
||||
from .metrics import ReportMetrics, compute_metrics, get_word_count_target
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class CombinedEvaluation:
    """Combined evaluation results from metrics and LLM judge."""

    # Automated (deterministic) metrics for the report.
    metrics: ReportMetrics
    # LLM-as-Judge result; None when LLM evaluation was skipped or failed.
    llm_evaluation: Optional[EvaluationResult]
    # Blended 0-10 score and its letter-grade equivalent.
    final_score: float
    grade: str
    # Human-readable markdown summary of the whole evaluation.
    summary: str

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary format (llm_evaluation serialized or None)."""
        llm_payload = self.llm_evaluation.to_dict() if self.llm_evaluation else None
        return {
            "metrics": self.metrics.to_dict(),
            "llm_evaluation": llm_payload,
            "final_score": self.final_score,
            "grade": self.grade,
            "summary": self.summary,
        }
|
||||
|
||||
|
||||
def score_to_grade(score: float) -> str:
    """Convert numeric score (0-10 scale) to letter grade.

    Grades step down in 0.5 increments from A+ (>= 9.0) to C- (>= 5.0),
    then D (>= 4.0); anything lower is an F.
    """
    cutoffs = (
        (9.0, "A+"),
        (8.5, "A"),
        (8.0, "A-"),
        (7.5, "B+"),
        (7.0, "B"),
        (6.5, "B-"),
        (6.0, "C+"),
        (5.5, "C"),
        (5.0, "C-"),
        (4.0, "D"),
    )
    for threshold, grade in cutoffs:
        if score >= threshold:
            return grade
    return "F"
|
||||
|
||||
|
||||
class ReportEvaluator:
    """
    Combined report evaluator using both automated metrics and LLM-as-Judge.

    This evaluator provides comprehensive report quality assessment by:
    1. Computing automated metrics (fast, deterministic)
    2. Running LLM-based evaluation (nuanced, contextual)
    3. Combining both for a final score and grade
    """

    def __init__(self, llm: Any = None, use_llm: bool = True):
        """
        Initialize the evaluator.

        Args:
            llm: Optional LLM instance for LLM-as-Judge evaluation
            use_llm: Whether to use LLM evaluation (can be disabled for speed)
        """
        self.use_llm = use_llm
        # Only build a judge when LLM evaluation is requested; LLMJudge
        # lazily creates its own LLM when `llm` is None.
        self.llm_judge = LLMJudge(llm=llm) if use_llm else None

    def _compute_metrics_score(
        self, metrics: ReportMetrics, report_style: str
    ) -> float:
        """
        Convert automated metrics to a 0-10 score.

        Scoring breakdown:
        - Section coverage: 30%
        - Citation quality: 25%
        - Word count compliance: 20%
        - Source diversity: 15%
        - Image inclusion: 10%

        Args:
            metrics: Precomputed automated metrics for the report
            report_style: Style key used to look up the word-count target

        Returns:
            Weighted score in [0, 10], rounded to 2 decimals
        """
        score = 0.0

        # Section coverage (30%): coverage score is already in [0, 1].
        section_score = metrics.section_coverage_score * 10
        score += section_score * 0.30

        # Citation quality (25%): saturates at 10 citations.
        citation_score = min(metrics.citation_count / 10, 1.0) * 10
        score += citation_score * 0.25

        # Word count compliance (20%). get_word_count_target always returns
        # a dict (it falls back to the "default" range), so word_score is
        # always defined; the former `if target:` truthiness guard was dead
        # code that would have left word_score unbound had it ever failed.
        target = get_word_count_target(report_style)
        if target["min"] <= metrics.word_count <= target["max"]:
            word_score = 10.0
        elif metrics.word_count < target["min"]:
            # Under target: scale linearly, capped at 8.
            word_score = (metrics.word_count / target["min"]) * 8
        else:
            # Over target: lose 5 points per 100% overshoot, floored at 5.
            excess_ratio = metrics.word_count / target["max"]
            word_score = max(10 - (excess_ratio - 1) * 5, 5)
        score += word_score * 0.20

        # Source diversity (15%): saturates at 5 unique domains.
        diversity_score = min(metrics.unique_sources / 5, 1.0) * 10
        score += diversity_score * 0.15

        # Image inclusion (10%): saturates at 3 images.
        image_score = min(metrics.image_count / 3, 1.0) * 10
        score += image_score * 0.10

        return round(score, 2)

    def _generate_summary(
        self,
        metrics: ReportMetrics,
        llm_eval: Optional[EvaluationResult],
        final_score: float,
        grade: str,
    ) -> str:
        """Generate a human-readable evaluation summary (markdown).

        Args:
            metrics: Automated metrics to report
            llm_eval: Optional LLM evaluation to include
            final_score: Blended final score
            grade: Letter grade for the final score

        Returns:
            Multi-line markdown summary string
        """
        lines = [f"Report Grade: {grade} ({final_score}/10)", ""]

        lines.append("**Automated Metrics:**")
        lines.append(f"- Word Count: {metrics.word_count}")
        lines.append(f"- Citations: {metrics.citation_count}")
        lines.append(f"- Unique Sources: {metrics.unique_sources}")
        lines.append(f"- Images: {metrics.image_count}")
        lines.append(
            f"- Section Coverage: {metrics.section_coverage_score * 100:.0f}%"
        )

        if metrics.sections_missing:
            lines.append(f"- Missing Sections: {', '.join(metrics.sections_missing)}")

        if llm_eval:
            lines.append("")
            lines.append("**LLM Evaluation:**")
            for criterion, score in llm_eval.scores.items():
                lines.append(f"- {criterion.replace('_', ' ').title()}: {score}/10")

            # Cap the lists at 3 items to keep the summary compact.
            if llm_eval.strengths:
                lines.append("")
                lines.append("**Strengths:**")
                for strength in llm_eval.strengths[:3]:
                    lines.append(f"- {strength}")

            if llm_eval.weaknesses:
                lines.append("")
                lines.append("**Areas for Improvement:**")
                for weakness in llm_eval.weaknesses[:3]:
                    lines.append(f"- {weakness}")

        return "\n".join(lines)

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """
        Evaluate a report using both metrics and LLM.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report

        Returns:
            CombinedEvaluation with full results
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)

        llm_eval = None
        if self.use_llm and self.llm_judge:
            try:
                llm_eval = await self.llm_judge.evaluate(report, query, report_style)
            except Exception as e:
                # Degrade gracefully: metrics-only scoring still works.
                logger.warning(f"LLM evaluation failed, using metrics only: {e}")

        # LLMJudge returns all-zero scores on failure, so an overall_score
        # of 0 means "no usable LLM verdict" -- only blend when positive.
        if llm_eval and llm_eval.overall_score > 0:
            final_score = (metrics_score * 0.4) + (llm_eval.weighted_score * 0.6)
        else:
            final_score = metrics_score

        final_score = round(final_score, 2)
        grade = score_to_grade(final_score)

        summary = self._generate_summary(metrics, llm_eval, final_score, grade)

        return CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=llm_eval,
            final_score=final_score,
            grade=grade,
            summary=summary,
        )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """Synchronous version of evaluate.

        NOTE: uses asyncio.run, so this must not be called from within an
        already-running event loop.
        """
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))

    def evaluate_metrics_only(
        self,
        report: str,
        report_style: str = "default",
    ) -> Dict[str, Any]:
        """
        Quick evaluation using only automated metrics (no LLM).

        Args:
            report: The report text to evaluate
            report_style: The style of report

        Returns:
            Dictionary with metrics and score
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)
        grade = score_to_grade(metrics_score)

        return {
            "metrics": metrics.to_dict(),
            "score": metrics_score,
            "grade": grade,
        }
|
||||
282
src/eval/llm_judge.py
Normal file
282
src/eval/llm_judge.py
Normal file
@@ -0,0 +1,282 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
LLM-as-Judge evaluation for report quality.
|
||||
|
||||
Uses an LLM to evaluate reports on multiple quality dimensions,
|
||||
providing more nuanced assessment than automated metrics alone.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain_core.messages import HumanMessage, SystemMessage
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Maximum characters of report content to send to the LLM for evaluation.
# This limit prevents exceeding LLM context windows and controls token usage.
MAX_REPORT_LENGTH = 15000

# Scoring criteria used by the judge. Each entry carries a human-readable
# description (mirrored in JUDGE_SYSTEM_PROMPT) and a weight used by
# LLMJudge._calculate_weighted_score; the weights sum to 1.0.
EVALUATION_CRITERIA = {
    "factual_accuracy": {
        "description": "Are claims supported by cited sources? Is information accurate and verifiable?",
        "weight": 0.25,
    },
    "completeness": {
        "description": "Does the report comprehensively cover all aspects of the topic?",
        "weight": 0.20,
    },
    "coherence": {
        "description": "Is the report logically structured, well-organized, and easy to follow?",
        "weight": 0.20,
    },
    "relevance": {
        "description": "Does the content directly address the research question without unnecessary tangents?",
        "weight": 0.15,
    },
    "citation_quality": {
        "description": "Are sources credible, diverse, and properly cited?",
        "weight": 0.10,
    },
    "writing_quality": {
        "description": "Is the writing clear, professional, and appropriate for the target audience?",
        "weight": 0.10,
    },
}

# System prompt for the judge LLM. It instructs the model to reply with bare
# JSON; _parse_response additionally tolerates a markdown code fence.
JUDGE_SYSTEM_PROMPT = """You are an expert report quality evaluator. Your task is to objectively assess the quality of research reports.

Evaluate the report on the following criteria, scoring each from 1-10:

1. **Factual Accuracy** (1-10): Are claims supported by cited sources? Is information accurate?
2. **Completeness** (1-10): Does the report cover all aspects of the topic comprehensively?
3. **Coherence** (1-10): Is the report logically structured and easy to follow?
4. **Relevance** (1-10): Does content directly address the research question?
5. **Citation Quality** (1-10): Are sources credible, diverse, and properly cited?
6. **Writing Quality** (1-10): Is the writing clear and appropriate for the audience?

Respond ONLY with a valid JSON object in this exact format:
{
    "scores": {
        "factual_accuracy": <1-10>,
        "completeness": <1-10>,
        "coherence": <1-10>,
        "relevance": <1-10>,
        "citation_quality": <1-10>,
        "writing_quality": <1-10>
    },
    "overall_score": <1-10>,
    "strengths": ["strength1", "strength2"],
    "weaknesses": ["weakness1", "weakness2"],
    "suggestions": ["suggestion1", "suggestion2"]
}

Be objective and thorough in your evaluation."""
|
||||
|
||||
|
||||
@dataclass
class EvaluationResult:
    """Container for LLM evaluation results."""

    # Per-criterion scores keyed by criterion name.
    scores: Dict[str, int]
    # Overall score reported by the LLM itself.
    overall_score: float
    # Score derived from EVALUATION_CRITERIA weights.
    weighted_score: float
    strengths: List[str]
    weaknesses: List[str]
    suggestions: List[str]
    # Raw LLM reply kept for debugging; excluded from to_dict().
    raw_response: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert evaluation result to dictionary (without raw_response)."""
        exported = (
            "scores",
            "overall_score",
            "weighted_score",
            "strengths",
            "weaknesses",
            "suggestions",
        )
        return {name: getattr(self, name) for name in exported}
|
||||
|
||||
|
||||
class LLMJudge:
    """LLM-based report quality evaluator.

    Prompts an LLM with JUDGE_SYSTEM_PROMPT to score a report on the
    criteria in EVALUATION_CRITERIA, parses the JSON verdict, and derives a
    weighted score. All failure paths return a usable EvaluationResult
    rather than raising.
    """

    def __init__(self, llm: Any = None):
        """
        Initialize the LLM Judge.

        Args:
            llm: LangChain-compatible LLM instance. If None, will be created on demand.
        """
        self._llm = llm

    def _get_llm(self):
        """Get or create the LLM instance.

        NOTE(review): the project LLM factory is imported lazily inside the
        method -- presumably to avoid circular imports or import-time cost;
        confirm before hoisting to module level.
        """
        if self._llm is None:
            from src.llms.llm import get_llm_by_type

            self._llm = get_llm_by_type("basic")
        return self._llm

    def _calculate_weighted_score(self, scores: Dict[str, int]) -> float:
        """Calculate weighted average score based on criteria weights.

        Criteria not present in EVALUATION_CRITERIA are ignored, and the
        result is normalized by the weights actually seen, so a partial
        score dict still yields a value on the same 0-10 scale. Returns 0.0
        when no recognized criterion is present.
        """
        total_weight = 0
        weighted_sum = 0

        for criterion, score in scores.items():
            if criterion in EVALUATION_CRITERIA:
                weight = EVALUATION_CRITERIA[criterion]["weight"]
                weighted_sum += score * weight
                total_weight += weight

        if total_weight > 0:
            return round(weighted_sum / total_weight, 2)
        return 0.0

    def _parse_response(self, response: str) -> Dict[str, Any]:
        """Parse LLM response into structured format.

        Strips an optional markdown code fence (```json ... ``` preferred,
        plain ``` ... ``` otherwise) before JSON decoding. On parse failure
        it returns a neutral all-5 payload whose text fields make the
        failure visible to the user, so callers can proceed uniformly.
        """
        try:
            json_match = response
            if "```json" in response:
                json_match = response.split("```json")[1].split("```")[0]
            elif "```" in response:
                json_match = response.split("```")[1].split("```")[0]

            return json.loads(json_match.strip())
        except (json.JSONDecodeError, IndexError) as e:
            logger.warning(f"Failed to parse LLM response: {e}")
            # Neutral middle-of-scale fallback; see docstring.
            return {
                "scores": {
                    "factual_accuracy": 5,
                    "completeness": 5,
                    "coherence": 5,
                    "relevance": 5,
                    "citation_quality": 5,
                    "writing_quality": 5,
                },
                "overall_score": 5,
                "strengths": ["Unable to parse evaluation"],
                "weaknesses": ["Evaluation parsing failed"],
                "suggestions": ["Please re-run evaluation"],
            }

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> EvaluationResult:
        """
        Evaluate a report using LLM-as-Judge.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report for context

        Returns:
            EvaluationResult with scores and feedback
        """
        llm = self._get_llm()

        # Report content is truncated to MAX_REPORT_LENGTH characters to
        # stay within the LLM context window.
        user_prompt = f"""Please evaluate the following research report.

**Original Research Query:** {query}

**Report Style:** {report_style}

**Report to Evaluate:**
{report[:MAX_REPORT_LENGTH]}

Provide your evaluation in the specified JSON format."""

        messages = [
            SystemMessage(content=JUDGE_SYSTEM_PROMPT),
            HumanMessage(content=user_prompt),
        ]

        try:
            response = await llm.ainvoke(messages)
            # Chat models return a message object exposing .content; fall
            # back to str() for plain-text LLM interfaces.
            response_text = (
                response.content if hasattr(response, "content") else str(response)
            )

            parsed = self._parse_response(response_text)

            scores = parsed.get("scores", {})
            weighted_score = self._calculate_weighted_score(scores)

            return EvaluationResult(
                scores=scores,
                overall_score=parsed.get("overall_score", 5),
                weighted_score=weighted_score,
                strengths=parsed.get("strengths", []),
                weaknesses=parsed.get("weaknesses", []),
                suggestions=parsed.get("suggestions", []),
                raw_response=response_text,
            )

        except Exception as e:
            logger.error(f"LLM evaluation failed: {e}")
            # All-zero scores signal total failure; downstream code treats
            # an overall_score of 0 as "no usable LLM evaluation".
            return EvaluationResult(
                scores={
                    "factual_accuracy": 0,
                    "completeness": 0,
                    "coherence": 0,
                    "relevance": 0,
                    "citation_quality": 0,
                    "writing_quality": 0,
                },
                overall_score=0,
                weighted_score=0,
                strengths=[],
                weaknesses=[f"Evaluation failed: {str(e)}"],
                suggestions=["Please retry evaluation"],
            )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> EvaluationResult:
        """
        Synchronous version of evaluate.

        NOTE(review): uses asyncio.run, so this will raise if invoked from
        within an already-running event loop.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report for context

        Returns:
            EvaluationResult with scores and feedback
        """
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))
|
||||
|
||||
|
||||
async def evaluate_with_llm(
    report: str,
    query: str,
    report_style: str = "default",
    llm: Any = None,
) -> EvaluationResult:
    """
    Convenience function to evaluate a report with LLM.

    Builds a throwaway LLMJudge around the given (or lazily-created) LLM
    and delegates to its evaluate coroutine.

    Args:
        report: The report text to evaluate
        query: The original research query
        report_style: The style of report for context
        llm: Optional LLM instance to use

    Returns:
        EvaluationResult with scores and feedback
    """
    return await LLMJudge(llm=llm).evaluate(report, query, report_style)
|
||||
229
src/eval/metrics.py
Normal file
229
src/eval/metrics.py
Normal file
@@ -0,0 +1,229 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
Automated metrics for report quality evaluation.
|
||||
|
||||
These metrics can be computed without LLM calls, providing fast and
|
||||
deterministic quality assessment.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
@dataclass
class ReportMetrics:
    """Container for computed report metrics."""

    # Word count (English words + CJK characters).
    word_count: int = 0
    # Number of markdown citation links.
    citation_count: int = 0
    # Number of distinct cited domains.
    unique_sources: int = 0
    # Number of embedded markdown images.
    image_count: int = 0
    # How many required sections were detected.
    section_count: int = 0
    sections_found: List[str] = field(default_factory=list)
    sections_missing: List[str] = field(default_factory=list)
    # Fraction of required sections present, in [0, 1].
    section_coverage_score: float = 0.0
    has_title: bool = False
    has_key_points: bool = False
    has_overview: bool = False
    has_citations_section: bool = False

    def to_dict(self) -> Dict:
        """Convert metrics to dictionary (all fields, declaration order)."""
        exported = (
            "word_count",
            "citation_count",
            "unique_sources",
            "image_count",
            "section_count",
            "sections_found",
            "sections_missing",
            "section_coverage_score",
            "has_title",
            "has_key_points",
            "has_overview",
            "has_citations_section",
        )
        return {name: getattr(self, name) for name in exported}
|
||||
|
||||
|
||||
# Required sections for different report styles. detect_sections falls back
# to the "default" entry for unknown styles.
REPORT_STYLE_SECTIONS = {
    "default": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "academic": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "literature_review",
        "methodology",
        "key_citations",
    ],
    "news": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "popular_science": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "social_media": [
        "title",
        "key_points",
        "overview",
        "key_citations",
    ],
    "strategic_investment": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "executive_summary",
        "market_analysis",
        "technology_analysis",
        "investment_recommendations",
        "key_citations",
    ],
}

# Section name patterns for detection (supports both English and Chinese).
# "title" is matched as a markdown H1 against the raw text; all other
# patterns are matched case-insensitively (see detect_sections).
SECTION_PATTERNS = {
    "title": r"^#\s+.+",
    "key_points": r"(?:key\s*points|要点|关键发现|核心观点)",
    "overview": r"(?:overview|概述|简介|背景)",
    "detailed_analysis": r"(?:detailed\s*analysis|详细分析|深度分析|分析)",
    "key_citations": r"(?:key\s*citations|references|参考文献|引用|来源)",
    "literature_review": r"(?:literature\s*review|文献综述|研究回顾)",
    "methodology": r"(?:methodology|方法论|研究方法)",
    "executive_summary": r"(?:executive\s*summary|执行摘要|投资建议)",
    "market_analysis": r"(?:market\s*analysis|市场分析|产业分析)",
    "technology_analysis": r"(?:technology|技术.*(?:分析|解析|深度))",
    "investment_recommendations": r"(?:investment.*recommend|投资建议|投资评级)",
}
|
||||
|
||||
|
||||
def count_words(text: str) -> int:
    """Count words in text, handling both English and Chinese.

    English words are runs of ASCII letters bounded by word boundaries;
    each character in the CJK Unified Ideographs block counts as one word.
    """
    english = re.findall(r"\b[a-zA-Z]+\b", text)
    cjk = re.findall(r"[\u4e00-\u9fff]", text)
    return len(english) + len(cjk)
|
||||
|
||||
|
||||
def count_citations(text: str) -> int:
    """Count markdown-style citations [text](url).

    Markdown images (``![alt](url)``) share this link syntax but are
    tracked separately by count_images, so a negative lookbehind on ``!``
    excludes them here to avoid double-counting a single embed as both an
    image and a citation.

    Args:
        text: Markdown text to scan.

    Returns:
        Number of inline links with an http(s) URL, excluding images.
    """
    pattern = r"(?<!!)\[.+?\]\(https?://[^\s\)]+\)"
    return len(re.findall(pattern, text))
|
||||
|
||||
|
||||
def extract_domains(text: str) -> List[str]:
    """Extract unique domains from URLs in the text.

    URLs are located with a permissive regex, normalized to lowercase, and
    a leading "www." prefix is stripped before deduplication.

    Args:
        text: Text (typically markdown) possibly containing http(s) URLs.

    Returns:
        List of unique domain strings (order not guaranteed).
    """
    url_pattern = r"https?://([^\s\)\]]+)"
    urls = re.findall(url_pattern, text)
    domains = set()
    for url in urls:
        try:
            parsed = urlparse(f"http://{url}")
            domain = parsed.netloc or url.split("/")[0]
            # Strip only a *leading* "www." -- str.replace would also mangle
            # domains that merely contain the substring (e.g. "awww.x.com").
            domain = domain.lower().removeprefix("www.")
            if domain:
                domains.add(domain)
        except Exception:
            # Skip anything urlparse chokes on; best-effort extraction.
            continue
    return list(domains)
|
||||
|
||||
|
||||
def count_images(text: str) -> int:
    """Count markdown image embeds of the form ``![alt](src)``."""
    return len(re.findall(r"!\[.*?\]\(.+?\)", text))
|
||||
|
||||
|
||||
def detect_sections(text: str, report_style: str = "default") -> Dict[str, bool]:
    """Detect which of the style's required sections appear in the report.

    The title is matched against the raw text (a markdown H1, per
    SECTION_PATTERNS); every other section is matched case-insensitively
    against the lowercased text. Unknown styles fall back to the "default"
    section list; sections without an explicit pattern match their own name
    with underscores treated as optional whitespace.
    """
    required = REPORT_STYLE_SECTIONS.get(
        report_style, REPORT_STYLE_SECTIONS["default"]
    )
    lowered = text.lower()

    def _present(name: str) -> bool:
        pattern = SECTION_PATTERNS.get(name, name.replace("_", r"\s*"))
        if name == "title":
            return bool(re.search(pattern, text, re.MULTILINE))
        return bool(re.search(pattern, lowered, re.IGNORECASE | re.MULTILINE))

    return {name: _present(name) for name in required}
|
||||
|
||||
|
||||
def compute_metrics(
    report: str, report_style: str = "default", target_word_count: Optional[int] = None
) -> ReportMetrics:
    """
    Compute automated metrics for a report.

    Args:
        report: The report text in markdown format
        report_style: The style of report (academic, news, etc.)
        target_word_count: Optional target word count for compliance check
            (accepted for API compatibility; not currently used)

    Returns:
        ReportMetrics object with computed values
    """
    detected = detect_sections(report, report_style)
    found = [name for name, present in detected.items() if present]
    missing = [name for name, present in detected.items() if not present]

    # Coverage is the fraction of this style's required sections present.
    total = len(detected)
    coverage = len(found) / total if total > 0 else 0.0

    return ReportMetrics(
        word_count=count_words(report),
        citation_count=count_citations(report),
        unique_sources=len(extract_domains(report)),
        image_count=count_images(report),
        section_count=len(found),
        sections_found=found,
        sections_missing=missing,
        section_coverage_score=coverage,
        has_title=detected.get("title", False),
        has_key_points=detected.get("key_points", False),
        has_overview=detected.get("overview", False),
        has_citations_section=detected.get("key_citations", False),
    )
|
||||
|
||||
|
||||
def get_word_count_target(report_style: str) -> Dict[str, int]:
    """Get target word count range for a report style.

    Always returns a {"min": ..., "max": ...} dict; unknown styles fall
    back to the general-purpose default range.
    """
    default_range = {"min": 1000, "max": 5000}
    per_style = {
        "strategic_investment": {"min": 10000, "max": 15000},
        "academic": {"min": 3000, "max": 8000},
        "news": {"min": 800, "max": 2000},
        "popular_science": {"min": 1500, "max": 4000},
        "social_media": {"min": 500, "max": 1500},
    }
    return per_style.get(report_style, default_range)
|
||||
@@ -35,6 +35,7 @@ from src.podcast.graph.builder import build_graph as build_podcast_graph
|
||||
from src.ppt.graph.builder import build_graph as build_ppt_graph
|
||||
from src.prompt_enhancer.graph.builder import build_graph as build_prompt_enhancer_graph
|
||||
from src.prose.graph.builder import build_graph as build_prose_graph
|
||||
from src.eval import ReportEvaluator
|
||||
from src.rag.builder import build_retriever
|
||||
from src.rag.milvus import load_examples as load_milvus_examples
|
||||
from src.rag.qdrant import load_examples as load_qdrant_examples
|
||||
@@ -47,6 +48,7 @@ from src.server.chat_request import (
|
||||
GenerateProseRequest,
|
||||
TTSRequest,
|
||||
)
|
||||
from src.server.eval_request import EvaluateReportRequest, EvaluateReportResponse
|
||||
from src.server.config_request import ConfigResponse
|
||||
from src.server.mcp_request import MCPServerMetadataRequest, MCPServerMetadataResponse
|
||||
from src.server.mcp_utils import load_mcp_tools
|
||||
@@ -946,6 +948,39 @@ async def generate_prose(request: GenerateProseRequest):
|
||||
raise HTTPException(status_code=500, detail=INTERNAL_SERVER_ERROR_DETAIL)
|
||||
|
||||
|
||||
@app.post("/api/report/evaluate", response_model=EvaluateReportResponse)
async def evaluate_report(request: EvaluateReportRequest):
    """Evaluate report quality using automated metrics and optionally LLM-as-Judge."""
    try:
        style = request.report_style or "default"
        evaluator = ReportEvaluator(use_llm=request.use_llm)

        # Fast path: metrics only, no LLM round-trip.
        if not request.use_llm:
            quick = evaluator.evaluate_metrics_only(request.content, style)
            return EvaluateReportResponse(
                metrics=quick["metrics"],
                score=quick["score"],
                grade=quick["grade"],
            )

        # Deep path: combined metrics + LLM-as-Judge evaluation.
        result = await evaluator.evaluate(request.content, request.query, style)
        llm_payload = (
            result.llm_evaluation.to_dict() if result.llm_evaluation else None
        )
        return EvaluateReportResponse(
            metrics=result.metrics.to_dict(),
            score=result.final_score,
            grade=result.grade,
            llm_evaluation=llm_payload,
            summary=result.summary,
        )
    except Exception as e:
        logger.exception(f"Error occurred during report evaluation: {str(e)}")
        raise HTTPException(status_code=500, detail=INTERNAL_SERVER_ERROR_DETAIL)
|
||||
|
||||
|
||||
@app.post("/api/prompt/enhance")
|
||||
async def enhance_prompt(request: EnhancePromptRequest):
|
||||
try:
|
||||
|
||||
71
src/server/eval_request.py
Normal file
71
src/server/eval_request.py
Normal file
@@ -0,0 +1,71 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""Request models for report evaluation endpoint."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class EvaluateReportRequest(BaseModel):
    """Request model for report evaluation."""

    # Full markdown body of the report to score.
    content: str = Field(description="Report markdown content to evaluate")
    # The research question the report answers; passed to the LLM judge.
    query: str = Field(description="Original research query")
    # Style key used for section/word-count expectations; server treats
    # None as "default".
    report_style: Optional[str] = Field(
        default="default", description="Report style (academic, news, etc.)"
    )
    # When False, only the fast automated metrics are computed.
    use_llm: bool = Field(
        default=False,
        description="Whether to use LLM for deep evaluation (slower but more detailed)",
    )
|
||||
|
||||
|
||||
class EvaluationMetrics(BaseModel):
    """Automated metrics result.

    Shape matches the dict produced by the evaluator's metrics
    ``to_dict()`` output, so the endpoint can pass it through directly.
    """

    # Volume and sourcing counts.
    word_count: int
    citation_count: int
    unique_sources: int
    image_count: int
    section_count: int
    # Coverage score for the expected section structure (higher is better).
    section_coverage_score: float
    sections_found: list[str]
    sections_missing: list[str]
    # Structural flags for the canonical report layout.
    has_title: bool
    has_key_points: bool
    has_overview: bool
    has_citations_section: bool
|
||||
|
||||
|
||||
class LLMEvaluationScores(BaseModel):
    """LLM evaluation scores.

    One integer score per judging criterion; defaults of 0 allow a
    partially parsed LLM response to still validate.
    """

    factual_accuracy: int = 0
    completeness: int = 0
    coherence: int = 0
    relevance: int = 0
    citation_quality: int = 0
    writing_quality: int = 0
|
||||
|
||||
|
||||
class LLMEvaluation(BaseModel):
    """LLM evaluation result.

    Carries the per-criterion scores plus the judge's qualitative
    feedback lists for display in the evaluation dialog.
    """

    scores: LLMEvaluationScores
    # Raw overall score reported by the LLM.
    overall_score: float
    # Score recomputed from per-criterion weights.
    weighted_score: float
    strengths: list[str]
    weaknesses: list[str]
    suggestions: list[str]
|
||||
|
||||
|
||||
class EvaluateReportResponse(BaseModel):
    """Response model for report evaluation.

    ``llm_evaluation`` and ``summary`` are only populated when the
    request asked for the LLM deep-evaluation path.
    """

    metrics: EvaluationMetrics
    # Final score on a 1-10 scale.
    score: float
    # Letter grade derived from the score (A+ through F).
    grade: str
    llm_evaluation: Optional[LLMEvaluation] = None
    summary: Optional[str] = None
|
||||
2
tests/unit/eval/__init__.py
Normal file
2
tests/unit/eval/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
489
tests/unit/eval/test_evaluator.py
Normal file
489
tests/unit/eval/test_evaluator.py
Normal file
@@ -0,0 +1,489 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""Unit tests for the combined report evaluator."""
|
||||
|
||||
import json
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from src.eval.evaluator import CombinedEvaluation, ReportEvaluator, score_to_grade
|
||||
from src.eval.llm_judge import (
|
||||
EVALUATION_CRITERIA,
|
||||
MAX_REPORT_LENGTH,
|
||||
EvaluationResult,
|
||||
LLMJudge,
|
||||
)
|
||||
from src.eval.metrics import ReportMetrics
|
||||
|
||||
|
||||
class TestScoreToGrade:
    """Tests for score to grade conversion.

    Each test walks a table of (score, expected grade) pairs covering
    one band of the grading scale, including the boundary values.
    """

    def test_excellent_scores(self):
        cases = [(9.5, "A+"), (9.0, "A+"), (8.7, "A"), (8.5, "A"), (8.2, "A-")]
        for score, expected in cases:
            assert score_to_grade(score) == expected

    def test_good_scores(self):
        cases = [(7.8, "B+"), (7.5, "B+"), (7.2, "B"), (7.0, "B"), (6.7, "B-")]
        for score, expected in cases:
            assert score_to_grade(score) == expected

    def test_average_scores(self):
        cases = [(6.2, "C+"), (5.8, "C"), (5.5, "C"), (5.2, "C-")]
        for score, expected in cases:
            assert score_to_grade(score) == expected

    def test_poor_scores(self):
        cases = [(4.5, "D"), (4.0, "D"), (3.0, "F"), (1.0, "F")]
        for score, expected in cases:
            assert score_to_grade(score) == expected
|
||||
|
||||
|
||||
class TestReportEvaluator:
    """Tests for ReportEvaluator class (metrics-only mode, no LLM)."""

    @pytest.fixture
    def evaluator(self):
        """Create evaluator without LLM for metrics-only tests."""
        return ReportEvaluator(use_llm=False)

    @pytest.fixture
    def sample_report(self):
        """Sample report for testing.

        A well-formed report: title, key points, overview, three
        analysis sections, one image, and a citations section with
        four distinct source domains.
        """
        return """
# Comprehensive Research Report

## Key Points
- Important finding number one with significant implications
- Critical discovery that changes our understanding
- Key insight that provides actionable recommendations
- Notable observation from the research data

## Overview
This report presents a comprehensive analysis of the research topic.
The findings are based on extensive data collection and analysis.

## Detailed Analysis

### Section 1: Background
The background of this research involves multiple factors.
[Source 1](https://example.com/source1) provides foundational context.

### Section 2: Methodology
Our methodology follows established research practices.
[Source 2](https://research.org/methods) outlines the approach.

### Section 3: Findings
The key findings include several important discoveries.


[Source 3](https://academic.edu/paper) supports these conclusions.

## Key Citations
- [Example Source](https://example.com/source1)
- [Research Methods](https://research.org/methods)
- [Academic Paper](https://academic.edu/paper)
- [Additional Reference](https://reference.com/doc)
"""

    def test_evaluate_metrics_only(self, evaluator, sample_report):
        """Test metrics-only evaluation."""
        result = evaluator.evaluate_metrics_only(sample_report)

        assert "metrics" in result
        assert "score" in result
        assert "grade" in result
        assert result["score"] > 0
        # Grade must be one of the defined letter grades.
        assert result["grade"] in ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D", "F"]

    def test_evaluate_metrics_only_structure(self, evaluator, sample_report):
        """Test that metrics contain expected fields."""
        result = evaluator.evaluate_metrics_only(sample_report)
        metrics = result["metrics"]

        assert "word_count" in metrics
        assert "citation_count" in metrics
        assert "unique_sources" in metrics
        assert "image_count" in metrics
        assert "section_coverage_score" in metrics

    def test_evaluate_minimal_report(self, evaluator):
        """Test evaluation of minimal report."""
        minimal_report = "Just some text."
        result = evaluator.evaluate_metrics_only(minimal_report)

        # No structure, citations, or sections -> low score, failing grade.
        assert result["score"] < 5.0
        assert result["grade"] in ["D", "F"]

    def test_metrics_score_calculation(self, evaluator):
        """Test that metrics score is calculated correctly."""
        good_report = """
# Title

## Key Points
- Point 1
- Point 2

## Overview
Overview content here.

## Detailed Analysis
Analysis with [cite](https://a.com) and [cite2](https://b.com)
and [cite3](https://c.com) and more [refs](https://d.com).



## Key Citations
- [A](https://a.com)
- [B](https://b.com)
"""
        result = evaluator.evaluate_metrics_only(good_report)
        assert result["score"] > 5.0

    def test_combined_evaluation_to_dict(self):
        """Test CombinedEvaluation to_dict method."""
        metrics = ReportMetrics(
            word_count=1000,
            citation_count=5,
            unique_sources=3,
        )
        evaluation = CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=None,
            final_score=7.5,
            grade="B+",
            summary="Test summary",
        )

        result = evaluation.to_dict()
        assert result["final_score"] == 7.5
        assert result["grade"] == "B+"
        # Nested metrics are serialized recursively.
        assert result["metrics"]["word_count"] == 1000
|
||||
|
||||
|
||||
class TestReportEvaluatorIntegration:
    """Integration tests for evaluator (may require LLM)."""

    @pytest.mark.asyncio
    async def test_full_evaluation_without_llm(self):
        """Test full evaluation with LLM disabled.

        The async evaluate() path must still work when use_llm=False;
        it should produce metrics, score, grade, and summary, with
        llm_evaluation left as None.
        """
        evaluator = ReportEvaluator(use_llm=False)

        report = """
# Test Report

## Key Points
- Key point 1

## Overview
Test overview.

## Key Citations
- [Test](https://test.com)
"""

        result = await evaluator.evaluate(report, "test query")

        assert isinstance(result, CombinedEvaluation)
        assert result.final_score > 0
        assert result.grade is not None
        assert result.summary is not None
        # LLM was disabled, so no LLM evaluation should be attached.
        assert result.llm_evaluation is None
|
||||
|
||||
|
||||
class TestLLMJudgeParseResponse:
    """Tests for LLMJudge._parse_response method.

    Covers the accepted response shapes (plain JSON, fenced JSON,
    JSON with surrounding prose) and the default-score fallback used
    when parsing fails.
    """

    @pytest.fixture
    def judge(self):
        """Create LLMJudge with mock LLM."""
        return LLMJudge(llm=MagicMock())

    @pytest.fixture
    def valid_response_data(self):
        """Valid evaluation response data."""
        return {
            "scores": {
                "factual_accuracy": 8,
                "completeness": 7,
                "coherence": 9,
                "relevance": 8,
                "citation_quality": 6,
                "writing_quality": 8,
            },
            "overall_score": 8,
            "strengths": ["Well researched", "Clear structure"],
            "weaknesses": ["Could use more citations"],
            "suggestions": ["Add more sources"],
        }

    def test_parse_valid_json(self, judge, valid_response_data):
        """Test parsing valid JSON response."""
        response = json.dumps(valid_response_data)
        result = judge._parse_response(response)

        assert result["scores"]["factual_accuracy"] == 8
        assert result["overall_score"] == 8
        assert "Well researched" in result["strengths"]

    def test_parse_json_in_markdown_block(self, judge, valid_response_data):
        """Test parsing JSON wrapped in markdown code block."""
        response = f"```json\n{json.dumps(valid_response_data)}\n```"
        result = judge._parse_response(response)

        assert result["scores"]["coherence"] == 9
        assert result["overall_score"] == 8

    def test_parse_json_in_generic_code_block(self, judge, valid_response_data):
        """Test parsing JSON in generic code block."""
        response = f"```\n{json.dumps(valid_response_data)}\n```"
        result = judge._parse_response(response)

        assert result["scores"]["relevance"] == 8

    def test_parse_malformed_json_returns_defaults(self, judge):
        """Test that malformed JSON returns default scores."""
        response = "This is not valid JSON at all"
        result = judge._parse_response(response)

        # Fallback uses neutral mid-scale scores (5) everywhere.
        assert result["scores"]["factual_accuracy"] == 5
        assert result["scores"]["completeness"] == 5
        assert result["overall_score"] == 5
        assert "Unable to parse evaluation" in result["strengths"]
        assert "Evaluation parsing failed" in result["weaknesses"]

    def test_parse_incomplete_json(self, judge):
        """Test parsing incomplete JSON."""
        response = '{"scores": {"factual_accuracy": 8}'  # Missing closing braces
        result = judge._parse_response(response)

        # Should return defaults due to parse failure
        assert result["overall_score"] == 5

    def test_parse_json_with_extra_text(self, judge, valid_response_data):
        """Test parsing JSON with surrounding text."""
        response = f"Here is my evaluation:\n```json\n{json.dumps(valid_response_data)}\n```\nHope this helps!"
        result = judge._parse_response(response)

        assert result["scores"]["factual_accuracy"] == 8
|
||||
|
||||
|
||||
class TestLLMJudgeCalculateWeightedScore:
    """Tests for LLMJudge._calculate_weighted_score method.

    Verifies the weighted average over EVALUATION_CRITERIA, including
    renormalization when only a subset of criteria is present.
    """

    @pytest.fixture
    def judge(self):
        """Create LLMJudge with mock LLM."""
        return LLMJudge(llm=MagicMock())

    def test_calculate_with_all_scores(self, judge):
        """Test weighted score calculation with all criteria."""
        scores = {
            "factual_accuracy": 10,  # weight 0.25
            "completeness": 10,  # weight 0.20
            "coherence": 10,  # weight 0.20
            "relevance": 10,  # weight 0.15
            "citation_quality": 10,  # weight 0.10
            "writing_quality": 10,  # weight 0.10
        }
        result = judge._calculate_weighted_score(scores)
        assert result == 10.0

    def test_calculate_with_varied_scores(self, judge):
        """Test weighted score with varied scores."""
        scores = {
            "factual_accuracy": 8,  # 8 * 0.25 = 2.0
            "completeness": 6,  # 6 * 0.20 = 1.2
            "coherence": 7,  # 7 * 0.20 = 1.4
            "relevance": 9,  # 9 * 0.15 = 1.35
            "citation_quality": 5,  # 5 * 0.10 = 0.5
            "writing_quality": 8,  # 8 * 0.10 = 0.8
        }
        # Total: 7.25
        result = judge._calculate_weighted_score(scores)
        assert result == 7.25

    def test_calculate_with_partial_scores(self, judge):
        """Test weighted score with only some criteria."""
        scores = {
            "factual_accuracy": 8,  # weight 0.25
            "completeness": 6,  # weight 0.20
        }
        # Renormalized over the present weights:
        # (8 * 0.25 + 6 * 0.20) / (0.25 + 0.20) = 3.2 / 0.45 = 7.11
        result = judge._calculate_weighted_score(scores)
        assert abs(result - 7.11) < 0.01

    def test_calculate_with_unknown_criteria(self, judge):
        """Test that unknown criteria are ignored."""
        scores = {
            "factual_accuracy": 10,
            "unknown_criterion": 1,  # Should be ignored
        }
        result = judge._calculate_weighted_score(scores)
        assert result == 10.0

    def test_calculate_with_empty_scores(self, judge):
        """Test with empty scores dict."""
        result = judge._calculate_weighted_score({})
        assert result == 0.0

    def test_weights_sum_to_one(self):
        """Verify that all criteria weights sum to 1.0."""
        total_weight = sum(c["weight"] for c in EVALUATION_CRITERIA.values())
        assert abs(total_weight - 1.0) < 0.001
|
||||
|
||||
|
||||
class TestLLMJudgeEvaluate:
    """Tests for LLMJudge.evaluate method with mocked LLM.

    The LLM is replaced by an AsyncMock whose ainvoke return value (or
    side effect) drives each scenario: success, transport failure,
    malformed output, prompt construction, and input truncation.
    """

    @pytest.fixture
    def valid_llm_response(self):
        """Create a valid LLM response."""
        return json.dumps(
            {
                "scores": {
                    "factual_accuracy": 8,
                    "completeness": 7,
                    "coherence": 9,
                    "relevance": 8,
                    "citation_quality": 7,
                    "writing_quality": 8,
                },
                "overall_score": 8,
                "strengths": ["Comprehensive coverage", "Well structured"],
                "weaknesses": ["Some claims need more support"],
                "suggestions": ["Add more academic sources"],
            }
        )

    @pytest.mark.asyncio
    async def test_successful_evaluation(self, valid_llm_response):
        """Test successful LLM evaluation."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = valid_llm_response
        mock_llm.ainvoke.return_value = mock_response

        judge = LLMJudge(llm=mock_llm)
        result = await judge.evaluate("Test report", "Test query")

        assert isinstance(result, EvaluationResult)
        assert result.scores["factual_accuracy"] == 8
        assert result.overall_score == 8
        assert result.weighted_score > 0
        assert "Comprehensive coverage" in result.strengths
        # The raw LLM output is preserved verbatim on the result.
        assert result.raw_response == valid_llm_response

    @pytest.mark.asyncio
    async def test_evaluation_with_llm_failure(self):
        """Test that LLM failures are handled gracefully."""
        mock_llm = AsyncMock()
        mock_llm.ainvoke.side_effect = Exception("LLM service unavailable")

        judge = LLMJudge(llm=mock_llm)
        result = await judge.evaluate("Test report", "Test query")

        # Failure must not raise; it yields zeroed scores and a
        # weakness entry mentioning the failure.
        assert isinstance(result, EvaluationResult)
        assert result.overall_score == 0
        assert result.weighted_score == 0
        assert all(score == 0 for score in result.scores.values())
        assert any("failed" in w.lower() for w in result.weaknesses)

    @pytest.mark.asyncio
    async def test_evaluation_with_malformed_response(self):
        """Test handling of malformed LLM response."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = "I cannot evaluate this report properly."
        mock_llm.ainvoke.return_value = mock_response

        judge = LLMJudge(llm=mock_llm)
        result = await judge.evaluate("Test report", "Test query")

        # Should return default scores
        assert result.scores["factual_accuracy"] == 5
        assert result.overall_score == 5

    @pytest.mark.asyncio
    async def test_evaluation_passes_report_style(self):
        """Test that report_style is passed to LLM."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = json.dumps(
            {
                "scores": {k: 7 for k in EVALUATION_CRITERIA.keys()},
                "overall_score": 7,
                "strengths": [],
                "weaknesses": [],
                "suggestions": [],
            }
        )
        mock_llm.ainvoke.return_value = mock_response

        judge = LLMJudge(llm=mock_llm)
        await judge.evaluate("Test report", "Test query", report_style="academic")

        # Verify the prompt contains the report style
        # (messages[1] is the user message in the prompt pair).
        call_args = mock_llm.ainvoke.call_args
        messages = call_args[0][0]
        user_message_content = messages[1].content
        assert "academic" in user_message_content

    @pytest.mark.asyncio
    async def test_evaluation_truncates_long_reports(self):
        """Test that very long reports are truncated."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = json.dumps(
            {
                "scores": {k: 7 for k in EVALUATION_CRITERIA.keys()},
                "overall_score": 7,
                "strengths": [],
                "weaknesses": [],
                "suggestions": [],
            }
        )
        mock_llm.ainvoke.return_value = mock_response

        judge = LLMJudge(llm=mock_llm)
        # Exceed the cap so truncation must kick in.
        long_report = "x" * (MAX_REPORT_LENGTH + 5000)
        await judge.evaluate(long_report, "Test query")

        call_args = mock_llm.ainvoke.call_args
        messages = call_args[0][0]
        user_message_content = messages[1].content
        # The report content in the message should be truncated to MAX_REPORT_LENGTH
        assert len(user_message_content) < len(long_report) + 500
|
||||
|
||||
|
||||
class TestEvaluationResult:
    """Tests for EvaluationResult dataclass."""

    def test_to_dict(self):
        """Test EvaluationResult.to_dict method.

        The serialized dict must carry scores and feedback lists but
        deliberately omit the raw LLM response text.
        """
        result = EvaluationResult(
            scores={"factual_accuracy": 8, "completeness": 7},
            overall_score=7.5,
            weighted_score=7.6,
            strengths=["Good research"],
            weaknesses=["Needs more detail"],
            suggestions=["Expand section 2"],
            raw_response="test response",
        )

        d = result.to_dict()
        assert d["scores"]["factual_accuracy"] == 8
        assert d["overall_score"] == 7.5
        assert d["weighted_score"] == 7.6
        assert "Good research" in d["strengths"]
        # raw_response should not be in dict
        assert "raw_response" not in d
|
||||
207
tests/unit/eval/test_metrics.py
Normal file
207
tests/unit/eval/test_metrics.py
Normal file
@@ -0,0 +1,207 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""Unit tests for report evaluation metrics."""
|
||||
|
||||
from src.eval.metrics import (
|
||||
compute_metrics,
|
||||
count_citations,
|
||||
count_images,
|
||||
count_words,
|
||||
detect_sections,
|
||||
extract_domains,
|
||||
get_word_count_target,
|
||||
)
|
||||
|
||||
|
||||
class TestCountWords:
    """Tests for word counting function.

    English text is counted by whitespace-separated words; Chinese
    text is counted per character.
    """

    def test_english_words(self):
        text = "This is a simple test sentence."
        assert count_words(text) == 6

    def test_chinese_characters(self):
        # Six CJK characters -> six "words".
        text = "这是一个测试"
        assert count_words(text) == 6

    def test_mixed_content(self):
        text = "Hello 你好 World 世界"
        assert count_words(text) == 4 + 2  # 4 Chinese characters + 2 English words

    def test_empty_string(self):
        assert count_words("") == 0
|
||||
|
||||
|
||||
class TestCountCitations:
    """Tests for citation counting function.

    Only markdown links with http(s) URLs count as citations.
    """

    def test_markdown_citations(self):
        text = """
Check out [Google](https://google.com) and [GitHub](https://github.com).
"""
        assert count_citations(text) == 2

    def test_no_citations(self):
        text = "This is plain text without any links."
        assert count_citations(text) == 0

    def test_invalid_urls(self):
        # Non-http(s) schemes and bare words are not citations.
        text = "[Link](not-a-url) [Another](ftp://ftp.example.com)"
        assert count_citations(text) == 0

    def test_complex_urls(self):
        # Query strings and paths must not break the match.
        text = "[Article](https://example.com/path/to/article?id=123&ref=test)"
        assert count_citations(text) == 1
|
||||
|
||||
|
||||
class TestExtractDomains:
    """Tests for domain extraction function."""

    def test_extract_multiple_domains(self):
        text = """
https://google.com/search
https://www.github.com/user/repo
https://docs.python.org/3/
"""
        domains = extract_domains(text)
        assert len(domains) == 3
        assert "google.com" in domains
        # "www." prefix is stripped.
        assert "github.com" in domains
        # Subdomains other than "www" are kept as-is.
        assert "docs.python.org" in domains

    def test_deduplicate_domains(self):
        # Same host with and without "www." collapses to one domain.
        text = """
https://example.com/page1
https://example.com/page2
https://www.example.com/page3
"""
        domains = extract_domains(text)
        assert len(domains) == 1
        assert "example.com" in domains

    def test_no_urls(self):
        text = "Plain text without URLs"
        assert extract_domains(text) == []
|
||||
|
||||
|
||||
class TestCountImages:
    """Tests for image counting function.

    Counts markdown image syntax (``![alt](url)``); plain links must
    not be counted.
    """

    def test_markdown_images(self):
        text = """


"""
        assert count_images(text) == 2

    def test_no_images(self):
        text = "Text without images [link](url)"
        assert count_images(text) == 0
|
||||
|
||||
|
||||
class TestDetectSections:
    """Tests for section detection function.

    Section headings are recognized in both English and Chinese.
    """

    def test_detect_title(self):
        text = "# My Report Title\n\nSome content here."
        sections = detect_sections(text)
        assert sections.get("title") is True

    def test_detect_key_points(self):
        text = "## Key Points\n- Point 1\n- Point 2"
        sections = detect_sections(text)
        assert sections.get("key_points") is True

    def test_detect_chinese_sections(self):
        # Chinese headings map to the same canonical section keys.
        text = """# 报告标题
## 要点
- 要点1
## 概述
这是概述内容
"""
        sections = detect_sections(text)
        assert sections.get("title") is True
        assert sections.get("key_points") is True
        assert sections.get("overview") is True

    def test_detect_citations_section(self):
        text = """
## Key Citations
- [Source 1](https://example.com)
"""
        sections = detect_sections(text)
        assert sections.get("key_citations") is True
|
||||
|
||||
|
||||
class TestComputeMetrics:
    """Tests for the main compute_metrics function."""

    def test_complete_report(self):
        """A structurally complete report sets all flags and counts."""
        report = """
# Research Report Title

## Key Points
- Point 1
- Point 2
- Point 3

## Overview
This is an overview of the research topic.

## Detailed Analysis
Here is the detailed analysis with [source](https://example.com).



## Key Citations
- [Source 1](https://example.com)
- [Source 2](https://another.com)
"""
        metrics = compute_metrics(report)

        assert metrics.has_title is True
        assert metrics.has_key_points is True
        assert metrics.has_overview is True
        assert metrics.has_citations_section is True
        assert metrics.citation_count >= 2
        assert metrics.image_count == 1
        assert metrics.unique_sources >= 1
        assert metrics.section_coverage_score > 0.5

    def test_minimal_report(self):
        """An unstructured blob yields empty flags and low coverage."""
        report = "Just some text without structure."
        metrics = compute_metrics(report)

        assert metrics.has_title is False
        assert metrics.citation_count == 0
        assert metrics.section_coverage_score < 0.5

    def test_metrics_to_dict(self):
        """to_dict exposes the metric fields as plain dict keys."""
        report = "# Title\n\nSome content"
        metrics = compute_metrics(report)
        result = metrics.to_dict()

        assert isinstance(result, dict)
        assert "word_count" in result
        assert "citation_count" in result
        assert "section_coverage_score" in result
|
||||
|
||||
|
||||
class TestGetWordCountTarget:
    """Tests for word count target function.

    Each report style maps to a min/max word budget; unknown styles
    fall back to the default range.
    """

    def test_strategic_investment_target(self):
        target = get_word_count_target("strategic_investment")
        assert target["min"] == 10000
        assert target["max"] == 15000

    def test_news_target(self):
        target = get_word_count_target("news")
        assert target["min"] == 800
        assert target["max"] == 2000

    def test_default_target(self):
        # Unknown styles use the default 1000-5000 word budget.
        target = get_word_count_target("unknown_style")
        assert target["min"] == 1000
        assert target["max"] == 5000
||||
@@ -150,6 +150,7 @@
|
||||
"downloadWord": "Word (.docx)",
|
||||
"downloadImage": "Image (.png)",
|
||||
"exportFailed": "Export failed, please try again",
|
||||
"evaluateReport": "Evaluate report quality",
|
||||
"searchingFor": "Searching for",
|
||||
"reading": "Reading",
|
||||
"runningPythonCode": "Running Python code",
|
||||
@@ -163,6 +164,31 @@
|
||||
"errorGeneratingPodcast": "Error when generating podcast. Please try again.",
|
||||
"downloadPodcast": "Download podcast"
|
||||
},
|
||||
"evaluation": {
|
||||
"title": "Report Quality Evaluation",
|
||||
"description": "Evaluate your report using automated metrics and AI analysis.",
|
||||
"evaluating": "Evaluating report...",
|
||||
"analyzing": "Running deep analysis...",
|
||||
"overallScore": "Overall Score",
|
||||
"metrics": "Report Metrics",
|
||||
"wordCount": "Word Count",
|
||||
"citations": "Citations",
|
||||
"sources": "Unique Sources",
|
||||
"images": "Images",
|
||||
"sectionCoverage": "Section Coverage",
|
||||
"detailedAnalysis": "Detailed Analysis",
|
||||
"deepEvaluation": "Deep Evaluation (AI)",
|
||||
"strengths": "Strengths",
|
||||
"weaknesses": "Areas for Improvement",
|
||||
"scores": {
|
||||
"factual_accuracy": "Factual Accuracy",
|
||||
"completeness": "Completeness",
|
||||
"coherence": "Coherence",
|
||||
"relevance": "Relevance",
|
||||
"citation_quality": "Citation Quality",
|
||||
"writing_quality": "Writing Quality"
|
||||
}
|
||||
},
|
||||
"messages": {
|
||||
"replaying": "Replaying",
|
||||
"replayDescription": "DeerFlow is now replaying the conversation...",
|
||||
|
||||
@@ -150,6 +150,7 @@
|
||||
"downloadWord": "Word (.docx)",
|
||||
"downloadImage": "图片 (.png)",
|
||||
"exportFailed": "导出失败,请重试",
|
||||
"evaluateReport": "评估报告质量",
|
||||
"searchingFor": "搜索",
|
||||
"reading": "阅读中",
|
||||
"runningPythonCode": "运行 Python 代码",
|
||||
@@ -163,6 +164,31 @@
|
||||
"errorGeneratingPodcast": "生成播客时出错。请重试。",
|
||||
"downloadPodcast": "下载播客"
|
||||
},
|
||||
"evaluation": {
|
||||
"title": "报告质量评估",
|
||||
"description": "使用自动化指标和 AI 分析评估您的报告。",
|
||||
"evaluating": "正在评估报告...",
|
||||
"analyzing": "正在进行深度分析...",
|
||||
"overallScore": "总体评分",
|
||||
"metrics": "报告指标",
|
||||
"wordCount": "字数",
|
||||
"citations": "引用数",
|
||||
"sources": "独立来源",
|
||||
"images": "图片数",
|
||||
"sectionCoverage": "章节覆盖率",
|
||||
"detailedAnalysis": "详细分析",
|
||||
"deepEvaluation": "深度评估 (AI)",
|
||||
"strengths": "优势",
|
||||
"weaknesses": "改进建议",
|
||||
"scores": {
|
||||
"factual_accuracy": "事实准确性",
|
||||
"completeness": "完整性",
|
||||
"coherence": "连贯性",
|
||||
"relevance": "相关性",
|
||||
"citation_quality": "引用质量",
|
||||
"writing_quality": "写作质量"
|
||||
}
|
||||
},
|
||||
"messages": {
|
||||
"replaying": "回放中",
|
||||
"replayDescription": "DeerFlow 正在回放对话...",
|
||||
|
||||
300
web/src/app/chat/components/evaluation-dialog.tsx
Normal file
300
web/src/app/chat/components/evaluation-dialog.tsx
Normal file
@@ -0,0 +1,300 @@
|
||||
// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
import {
|
||||
BookOpen,
|
||||
FileText,
|
||||
Image,
|
||||
Link2,
|
||||
Loader2,
|
||||
Sparkles,
|
||||
ThumbsDown,
|
||||
ThumbsUp,
|
||||
} from "lucide-react";
|
||||
import { useTranslations } from "next-intl";
|
||||
import { useCallback, useEffect, useRef, useState } from "react";
|
||||
|
||||
import { Button } from "~/components/ui/button";
|
||||
import {
|
||||
Dialog,
|
||||
DialogContent,
|
||||
DialogDescription,
|
||||
DialogHeader,
|
||||
DialogTitle,
|
||||
} from "~/components/ui/dialog";
|
||||
import { Progress } from "~/components/ui/progress";
|
||||
import { evaluateReport, type EvaluationResult } from "~/core/api";
|
||||
import { cn } from "~/lib/utils";
|
||||
|
||||
interface EvaluationDialogProps {
|
||||
open: boolean;
|
||||
onOpenChange: (open: boolean) => void;
|
||||
reportContent: string;
|
||||
query: string;
|
||||
reportStyle?: string;
|
||||
}
|
||||
|
||||
function GradeBadge({ grade }: { grade: string }) {
|
||||
const gradeColors: Record<string, string> = {
|
||||
"A+": "bg-emerald-500",
|
||||
A: "bg-emerald-500",
|
||||
"A-": "bg-emerald-400",
|
||||
"B+": "bg-blue-500",
|
||||
B: "bg-blue-500",
|
||||
"B-": "bg-blue-400",
|
||||
"C+": "bg-yellow-500",
|
||||
C: "bg-yellow-500",
|
||||
"C-": "bg-yellow-400",
|
||||
D: "bg-orange-500",
|
||||
F: "bg-red-500",
|
||||
};
|
||||
|
||||
return (
|
||||
<div
|
||||
aria-label={`Report grade: ${grade}`}
|
||||
className={cn(
|
||||
"flex h-16 w-16 items-center justify-center rounded-full text-2xl font-bold text-white",
|
||||
gradeColors[grade] ?? "bg-gray-500",
|
||||
)}
|
||||
>
|
||||
{grade}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
// Single row in the metrics panel: icon, label, and right-aligned value
// with an optional unit suffix.
function MetricItem({
  icon: Icon,
  label,
  value,
  suffix,
}: {
  icon: React.ComponentType<{ className?: string }>;
  label: string;
  value: number | string;
  suffix?: string;
}) {
  return (
    <div className="flex items-center gap-3">
      <Icon className="text-muted-foreground h-4 w-4" />
      <span className="text-muted-foreground text-sm">{label}</span>
      <span className="ml-auto font-medium">
        {value}
        {suffix}
      </span>
    </div>
  );
}
|
||||
|
||||
/**
 * Modal dialog that evaluates a research report's quality.
 *
 * When opened it immediately runs the fast metrics-only evaluation, then
 * offers an optional "deep evaluation" button that re-runs the evaluation
 * with the LLM-as-Judge pass for detailed analysis.
 *
 * Props:
 * - open / onOpenChange: controlled dialog visibility.
 * - reportContent: report markdown to evaluate.
 * - query: the original research query the report answers.
 * - reportStyle: report style passed through to the evaluation API.
 */
export function EvaluationDialog({
  open,
  onOpenChange,
  reportContent,
  query,
  reportStyle,
}: EvaluationDialogProps) {
  const t = useTranslations("chat.evaluation");
  // `loading` covers the initial metrics-only pass; `deepLoading` covers the
  // optional LLM pass so the deep-evaluation button can show its own spinner.
  const [loading, setLoading] = useState(false);
  const [deepLoading, setDeepLoading] = useState(false);
  const [result, setResult] = useState<EvaluationResult | null>(null);
  const [error, setError] = useState<string | null>(null);
  // A ref (not state) so flipping the guard does not trigger a re-render,
  // and so the auto-run effect below fires at most once per dialog opening
  // even when `runEvaluation` changes identity.
  const hasRunInitialEvaluation = useRef(false);

  // Calls the evaluation API; `useLlm` selects deep (LLM) vs metrics-only.
  const runEvaluation = useCallback(
    async (useLlm: boolean) => {
      if (useLlm) {
        setDeepLoading(true);
      } else {
        setLoading(true);
      }
      setError(null);

      try {
        const evalResult = await evaluateReport(
          reportContent,
          query,
          reportStyle,
          useLlm,
        );
        setResult(evalResult);
      } catch (err) {
        // NOTE(review): a failed deep evaluation sets `error`, and the render
        // below shows the error branch even though a metrics-only `result`
        // may already exist — confirm hiding the earlier result is intended.
        setError(err instanceof Error ? err.message : "Evaluation failed");
      } finally {
        // Clear both flags unconditionally; only one was set above.
        setLoading(false);
        setDeepLoading(false);
      }
    },
    [reportContent, query, reportStyle],
  );

  // Auto-run the quick metrics evaluation once each time the dialog opens.
  useEffect(() => {
    if (open && !hasRunInitialEvaluation.current) {
      hasRunInitialEvaluation.current = true;
      void runEvaluation(false);
    }
  }, [open, runEvaluation]);

  // Reset transient state on close so the next opening starts clean and
  // re-triggers the initial evaluation.
  useEffect(() => {
    if (!open) {
      setResult(null);
      setError(null);
      hasRunInitialEvaluation.current = false;
    }
  }, [open]);

  return (
    <Dialog open={open} onOpenChange={onOpenChange}>
      <DialogContent className="sm:max-w-md">
        <DialogHeader>
          <DialogTitle>{t("title")}</DialogTitle>
          <DialogDescription>{t("description")}</DialogDescription>
        </DialogHeader>

        {loading && !result ? (
          <div className="flex flex-col items-center justify-center py-8">
            <Loader2 className="h-8 w-8 animate-spin text-blue-500" />
            <p className="text-muted-foreground mt-4 text-sm">
              {t("evaluating")}
            </p>
          </div>
        ) : error ? (
          <div className="py-4 text-center text-red-500">{error}</div>
        ) : result ? (
          <div className="space-y-6">
            {/* Grade and Score */}
            <div className="flex items-center gap-6">
              <GradeBadge grade={result.grade} />
              <div>
                <div className="text-3xl font-bold">{result.score}/10</div>
                <div className="text-muted-foreground text-sm">
                  {t("overallScore")}
                </div>
              </div>
            </div>

            {/* Metrics (always present, LLM not required) */}
            <div className="space-y-3">
              <h4 className="text-sm font-medium">{t("metrics")}</h4>
              <div className="bg-muted/50 space-y-2 rounded-lg p-3">
                <MetricItem
                  icon={FileText}
                  label={t("wordCount")}
                  value={result.metrics.word_count.toLocaleString()}
                />
                <MetricItem
                  icon={Link2}
                  label={t("citations")}
                  value={result.metrics.citation_count}
                />
                <MetricItem
                  icon={BookOpen}
                  label={t("sources")}
                  value={result.metrics.unique_sources}
                />
                <MetricItem
                  icon={Image}
                  label={t("images")}
                  value={result.metrics.image_count}
                />
                <div className="pt-2">
                  <div className="mb-1 flex items-center justify-between text-sm">
                    <span className="text-muted-foreground">
                      {t("sectionCoverage")}
                    </span>
                    <span className="font-medium">
                      {Math.round(result.metrics.section_coverage_score * 100)}%
                    </span>
                  </div>
                  <Progress
                    value={result.metrics.section_coverage_score * 100}
                    className="h-2"
                  />
                </div>
              </div>
            </div>

            {/* LLM Evaluation Results — only after a deep evaluation */}
            {result.llm_evaluation && (
              <div className="space-y-3">
                <h4 className="text-sm font-medium">{t("detailedAnalysis")}</h4>

                {/* LLM per-dimension scores, each rendered out of 10 */}
                <div className="bg-muted/50 grid grid-cols-2 gap-2 rounded-lg p-3 text-sm">
                  {Object.entries(result.llm_evaluation.scores).map(
                    ([key, value]) => (
                      <div key={key} className="flex justify-between">
                        <span className="text-muted-foreground">
                          {t(`scores.${key}`)}
                        </span>
                        <span className="font-medium">{value}/10</span>
                      </div>
                    ),
                  )}
                </div>

                {/* Strengths — capped at the first 3 to keep the dialog compact */}
                {result.llm_evaluation.strengths.length > 0 && (
                  <div className="space-y-2">
                    <div className="flex items-center gap-2 text-sm font-medium text-emerald-600">
                      <ThumbsUp className="h-4 w-4" />
                      {t("strengths")}
                    </div>
                    <ul className="space-y-1 text-sm">
                      {result.llm_evaluation.strengths
                        .slice(0, 3)
                        .map((s, i) => (
                          <li key={i} className="text-muted-foreground">
                            • {s}
                          </li>
                        ))}
                    </ul>
                  </div>
                )}

                {/* Weaknesses — capped at the first 3 */}
                {result.llm_evaluation.weaknesses.length > 0 && (
                  <div className="space-y-2">
                    <div className="flex items-center gap-2 text-sm font-medium text-orange-600">
                      <ThumbsDown className="h-4 w-4" />
                      {t("weaknesses")}
                    </div>
                    <ul className="space-y-1 text-sm">
                      {result.llm_evaluation.weaknesses
                        .slice(0, 3)
                        .map((w, i) => (
                          <li key={i} className="text-muted-foreground">
                            • {w}
                          </li>
                        ))}
                    </ul>
                  </div>
                )}
              </div>
            )}

            {/* Deep Evaluation Button — hidden once an LLM result exists */}
            {!result.llm_evaluation && (
              <Button
                variant="outline"
                className="w-full"
                onClick={() => runEvaluation(true)}
                disabled={deepLoading}
              >
                {deepLoading ? (
                  <>
                    <Loader2 className="mr-2 h-4 w-4 animate-spin" />
                    {t("analyzing")}
                  </>
                ) : (
                  <>
                    <Sparkles className="mr-2 h-4 w-4" />
                    {t("deepEvaluation")}
                  </>
                )}
              </Button>
            )}
          </div>
        ) : null}
      </DialogContent>
    </Dialog>
  );
}
|
||||
@@ -16,6 +16,7 @@ import { jsPDF } from "jspdf";
|
||||
import {
|
||||
Check,
|
||||
Copy,
|
||||
GraduationCap,
|
||||
Headphones,
|
||||
Pencil,
|
||||
Undo2,
|
||||
@@ -43,9 +44,10 @@ import {
|
||||
} from "~/components/ui/dropdown-menu";
|
||||
import { Tabs, TabsContent, TabsList, TabsTrigger } from "~/components/ui/tabs";
|
||||
import { useReplay } from "~/core/replay";
|
||||
import { closeResearch, listenToPodcast, useStore } from "~/core/store";
|
||||
import { closeResearch, getResearchQuery, listenToPodcast, useStore, useSettingsStore } from "~/core/store";
|
||||
import { cn } from "~/lib/utils";
|
||||
|
||||
import { EvaluationDialog } from "./evaluation-dialog";
|
||||
import { ResearchActivitiesBlock } from "./research-activities-block";
|
||||
import { ResearchReportBlock } from "./research-report-block";
|
||||
|
||||
@@ -84,6 +86,7 @@ export function ResearchBlock({
|
||||
const [editing, setEditing] = useState(false);
|
||||
const [isDownloading, setIsDownloading] = useState(false);
|
||||
const [copied, setCopied] = useState(false);
|
||||
const [showEvaluation, setShowEvaluation] = useState(false);
|
||||
const handleCopy = useCallback(() => {
|
||||
if (!reportId) {
|
||||
return;
|
||||
@@ -676,6 +679,16 @@ ${htmlContent}
|
||||
{copied ? <Check /> : <Copy />}
|
||||
</Button>
|
||||
</Tooltip>
|
||||
<Tooltip title={t("evaluateReport")}>
|
||||
<Button
|
||||
className="text-gray-400"
|
||||
size="icon"
|
||||
variant="ghost"
|
||||
onClick={() => setShowEvaluation(true)}
|
||||
>
|
||||
<GraduationCap />
|
||||
</Button>
|
||||
</Tooltip>
|
||||
<DropdownMenu>
|
||||
<Tooltip title={t("downloadReport")}>
|
||||
<DropdownMenuTrigger asChild>
|
||||
@@ -796,6 +809,19 @@ ${htmlContent}
|
||||
</TabsContent>
|
||||
</Tabs>
|
||||
</Card>
|
||||
|
||||
{/* Evaluation Dialog */}
|
||||
{reportId && researchId && (
|
||||
<EvaluationDialog
|
||||
open={showEvaluation}
|
||||
onOpenChange={setShowEvaluation}
|
||||
reportContent={
|
||||
useStore.getState().messages.get(reportId)?.content ?? ""
|
||||
}
|
||||
query={getResearchQuery(researchId)}
|
||||
reportStyle={useSettingsStore.getState().general.reportStyle.toLowerCase()}
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
30
web/src/components/ui/progress.tsx
Normal file
30
web/src/components/ui/progress.tsx
Normal file
@@ -0,0 +1,30 @@
|
||||
"use client"
|
||||
|
||||
import * as React from "react"
|
||||
|
||||
import { cn } from "~/lib/utils"
|
||||
|
||||
interface ProgressProps extends React.HTMLAttributes<HTMLDivElement> {
|
||||
value?: number
|
||||
}
|
||||
|
||||
function Progress({ className, value = 0, ...props }: ProgressProps) {
|
||||
return (
|
||||
<div
|
||||
data-slot="progress"
|
||||
className={cn(
|
||||
"bg-primary/20 relative h-2 w-full overflow-hidden rounded-full",
|
||||
className
|
||||
)}
|
||||
{...props}
|
||||
>
|
||||
<div
|
||||
data-slot="progress-indicator"
|
||||
className="bg-primary h-full transition-all duration-300 ease-in-out"
|
||||
style={{ width: `${Math.min(100, Math.max(0, value))}%` }}
|
||||
/>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
export { Progress }
|
||||
91
web/src/core/api/evaluate.ts
Normal file
91
web/src/core/api/evaluate.ts
Normal file
@@ -0,0 +1,91 @@
|
||||
// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
import { resolveServiceURL } from "./resolve-service-url";
|
||||
|
||||
/**
|
||||
* Report evaluation API client.
|
||||
*/
|
||||
|
||||
/**
 * Automated metrics computed from the report content without an LLM.
 * Field names mirror the backend JSON response (snake_case).
 */
export interface EvaluationMetrics {
  word_count: number;
  citation_count: number;
  /** Number of distinct citation sources (source diversity). */
  unique_sources: number;
  image_count: number;
  section_count: number;
  /** Rendered by the UI as a percentage (`* 100`), so expected to be a
   *  fraction in [0, 1] — confirm against the backend implementation. */
  section_coverage_score: number;
  sections_found: string[];
  sections_missing: string[];
  has_title: boolean;
  has_key_points: boolean;
  has_overview: boolean;
  has_citations_section: boolean;
}

/** Per-dimension scores from the LLM-as-Judge pass; shown as "n/10" in the UI. */
export interface LLMEvaluationScores {
  factual_accuracy: number;
  completeness: number;
  coherence: number;
  relevance: number;
  citation_quality: number;
  writing_quality: number;
}

/** Full LLM-as-Judge result: numeric scores plus qualitative feedback lists. */
export interface LLMEvaluation {
  scores: LLMEvaluationScores;
  overall_score: number;
  /** Presumably an aggregate with per-dimension weights applied — confirm
   *  against the backend's weighting logic. */
  weighted_score: number;
  strengths: string[];
  weaknesses: string[];
  suggestions: string[];
}

/** Combined evaluation response returned by the backend. */
export interface EvaluationResult {
  metrics: EvaluationMetrics;
  /** Final score on a 1–10 scale. */
  score: number;
  /** Letter grade, "A+" through "F". */
  grade: string;
  /** Present only when the deep (LLM) evaluation was requested. */
  llm_evaluation?: LLMEvaluation;
  summary?: string;
}

/** Request body for the report-evaluation endpoint. */
export interface EvaluateReportRequest {
  content: string;
  query: string;
  report_style?: string;
  use_llm?: boolean;
}
|
||||
|
||||
/**
|
||||
* Evaluate a report's quality using automated metrics and optionally LLM-as-Judge.
|
||||
*
|
||||
* @param content - Report markdown content
|
||||
* @param query - Original research query
|
||||
* @param reportStyle - Report style (academic, news, etc.)
|
||||
* @param useLlm - Whether to use LLM for deep evaluation
|
||||
* @returns Evaluation result with metrics, score, and grade
|
||||
*/
|
||||
export async function evaluateReport(
|
||||
content: string,
|
||||
query: string,
|
||||
reportStyle?: string,
|
||||
useLlm?: boolean,
|
||||
): Promise<EvaluationResult> {
|
||||
const response = await fetch(resolveServiceURL("report/evaluate"), {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
content,
|
||||
query,
|
||||
report_style: reportStyle ?? "default",
|
||||
use_llm: useLlm ?? false,
|
||||
} satisfies EvaluateReportRequest),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`Evaluation failed: ${response.statusText}`);
|
||||
}
|
||||
|
||||
return response.json();
|
||||
}
|
||||
@@ -2,6 +2,7 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
export * from "./chat";
|
||||
export * from "./evaluate";
|
||||
export * from "./mcp";
|
||||
export * from "./podcast";
|
||||
export * from "./prompt-enhancer";
|
||||
|
||||
@@ -24,6 +24,7 @@ export const useStore = create<{
|
||||
researchPlanIds: Map<string, string>;
|
||||
researchReportIds: Map<string, string>;
|
||||
researchActivityIds: Map<string, string[]>;
|
||||
researchQueries: Map<string, string>;
|
||||
ongoingResearchId: string | null;
|
||||
openResearchId: string | null;
|
||||
|
||||
@@ -42,6 +43,7 @@ export const useStore = create<{
|
||||
researchPlanIds: new Map<string, string>(),
|
||||
researchReportIds: new Map<string, string>(),
|
||||
researchActivityIds: new Map<string, string[]>(),
|
||||
researchQueries: new Map<string, string>(),
|
||||
ongoingResearchId: null,
|
||||
openResearchId: null,
|
||||
|
||||
@@ -267,11 +269,17 @@ function getOngoingResearchId() {
|
||||
|
||||
function appendResearch(researchId: string) {
|
||||
let planMessage: Message | undefined;
|
||||
let userQuery: string | undefined;
|
||||
const reversedMessageIds = [...useStore.getState().messageIds].reverse();
|
||||
for (const messageId of reversedMessageIds) {
|
||||
const message = getMessage(messageId);
|
||||
if (message?.agent === "planner") {
|
||||
if (!planMessage && message?.agent === "planner") {
|
||||
planMessage = message;
|
||||
}
|
||||
if (!userQuery && message?.role === "user") {
|
||||
userQuery = message.content;
|
||||
}
|
||||
if (planMessage && userQuery) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -288,6 +296,10 @@ function appendResearch(researchId: string) {
|
||||
researchId,
|
||||
messageIds,
|
||||
),
|
||||
researchQueries: new Map(useStore.getState().researchQueries).set(
|
||||
researchId,
|
||||
userQuery ?? "",
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -394,6 +406,10 @@ export function useResearchMessage(researchId: string) {
|
||||
);
|
||||
}
|
||||
|
||||
export function getResearchQuery(researchId: string): string {
|
||||
return useStore.getState().researchQueries.get(researchId) ?? "";
|
||||
}
|
||||
|
||||
export function useMessage(messageId: string | null | undefined) {
|
||||
return useStore(
|
||||
useShallow((state) =>
|
||||
|
||||
Reference in New Issue
Block a user