feat(eval): add report quality evaluation module and UI integration (#776)

* feat(eval): add report quality evaluation module

Addresses issue #773 - How to evaluate generated report quality objectively.

This module provides two evaluation approaches:
1. Automated metrics (no LLM required):
   - Citation count and source diversity
   - Word count compliance per report style
   - Section structure validation
   - Image inclusion tracking

2. LLM-as-Judge evaluation:
   - Factual accuracy scoring
   - Completeness assessment
   - Coherence evaluation
   - Relevance and citation quality checks

The combined evaluator provides a final score (1-10) and letter grade (A+ to F).

Files added:
- src/eval/__init__.py
- src/eval/metrics.py
- src/eval/llm_judge.py
- src/eval/evaluator.py
- tests/unit/eval/test_metrics.py
- tests/unit/eval/test_evaluator.py

* feat(eval): integrate report evaluation with web UI

This commit adds the web UI integration for the evaluation module:

Backend:
- Add EvaluateReportRequest/Response models in src/server/eval_request.py
- Add /api/report/evaluate endpoint to src/server/app.py

Frontend:
- Add evaluateReport API function in web/src/core/api/evaluate.ts
- Create EvaluationDialog component with grade badge, metrics display,
  and optional LLM deep evaluation
- Add evaluation button (graduation cap icon) to research-block.tsx toolbar
- Add i18n translations for English and Chinese

The evaluation UI allows users to:
1. View quick metrics-only evaluation (instant)
2. Optionally run deep LLM-based evaluation for detailed analysis
3. See grade (A+ to F), score (1-10), and metric breakdown

* feat(eval): improve evaluation reliability and add LLM judge tests

- Extract MAX_REPORT_LENGTH constant in llm_judge.py for maintainability
- Add comprehensive unit tests for LLMJudge class (parse_response,
  calculate_weighted_score, evaluate with mocked LLM)
- Pass reportStyle prop to EvaluationDialog for accurate evaluation criteria
- Add researchQueries store map to reliably associate queries with research
- Add getResearchQuery helper to retrieve query by researchId
- Remove unused imports in test_metrics.py

* fix(eval): use resolveServiceURL for evaluate API endpoint

The evaluateReport function was using a relative URL '/api/report/evaluate'
which sent requests to the Next.js server instead of the FastAPI backend.
Changed to use resolveServiceURL() consistent with other API functions.

* fix: improve type accuracy and React hooks in evaluation components

- Fix get_word_count_target return type from Optional[Dict] to Dict since it always returns a value via default fallback
- Fix useEffect dependency issue in EvaluationDialog using useRef to prevent unwanted re-evaluations
- Add aria-label to GradeBadge for screen reader accessibility
This commit is contained in:
Willem Jiang
2025-12-25 21:55:48 +08:00
committed by GitHub
parent 84a7f7815c
commit 8d9d767051
17 changed files with 2103 additions and 2 deletions

21
src/eval/__init__.py Normal file
View File

@@ -0,0 +1,21 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""
Report Quality Evaluation Module for DeerFlow.
This module provides objective methods to evaluate generated report quality,
including automated metrics and LLM-based evaluation.
"""
from .evaluator import ReportEvaluator
from .metrics import ReportMetrics, compute_metrics
from .llm_judge import LLMJudge, evaluate_with_llm
# Public, stable API of the evaluation package; import from here, not submodules.
__all__ = [
    "ReportEvaluator",
    "ReportMetrics",
    "compute_metrics",
    "LLMJudge",
    "evaluate_with_llm",
]

249
src/eval/evaluator.py Normal file
View File

@@ -0,0 +1,249 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""
Combined report evaluator orchestrating both automated metrics and LLM evaluation.
"""
import logging
from dataclasses import dataclass
from typing import Any, Dict, Optional
from .llm_judge import EvaluationResult, LLMJudge
from .metrics import ReportMetrics, compute_metrics, get_word_count_target
logger = logging.getLogger(__name__)
@dataclass
class CombinedEvaluation:
    """Aggregate of automated-metric results and (optional) LLM-judge results."""

    metrics: ReportMetrics                      # automated metrics container
    llm_evaluation: Optional[EvaluationResult]  # None when LLM judging was skipped or failed
    final_score: float                          # blended 0-10 score
    grade: str                                  # letter grade derived from final_score
    summary: str                                # human-readable markdown summary

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the evaluation to a plain, JSON-friendly dictionary."""
        llm_part = self.llm_evaluation.to_dict() if self.llm_evaluation else None
        return {
            "metrics": self.metrics.to_dict(),
            "llm_evaluation": llm_part,
            "final_score": self.final_score,
            "grade": self.grade,
            "summary": self.summary,
        }
def score_to_grade(score: float) -> str:
    """Map a numeric score on a 0-10 scale to a letter grade.

    Bands are inclusive at their lower cutoff; anything below 4.0 is an F.
    """
    bands = (
        (9.0, "A+"),
        (8.5, "A"),
        (8.0, "A-"),
        (7.5, "B+"),
        (7.0, "B"),
        (6.5, "B-"),
        (6.0, "C+"),
        (5.5, "C"),
        (5.0, "C-"),
        (4.0, "D"),
    )
    for cutoff, grade in bands:
        if score >= cutoff:
            return grade
    return "F"
class ReportEvaluator:
    """
    Combined report evaluator using both automated metrics and LLM-as-Judge.

    Quality is assessed in two passes: fast deterministic metrics first, then
    an optional LLM pass for nuance. The two are blended into one final score
    and letter grade.
    """

    def __init__(self, llm: Any = None, use_llm: bool = True):
        """
        Initialize the evaluator.

        Args:
            llm: Optional LLM instance for LLM-as-Judge evaluation
            use_llm: Whether to use LLM evaluation (can be disabled for speed)
        """
        self.use_llm = use_llm
        self.llm_judge = LLMJudge(llm=llm) if use_llm else None

    def _compute_metrics_score(
        self, metrics: ReportMetrics, report_style: str
    ) -> float:
        """
        Convert automated metrics to a 0-10 score.

        Weighting: section coverage 30%, citations 25%, word count 20%,
        source diversity 15%, image inclusion 10%.
        """
        total = 0.0
        # Section coverage (30%): fraction of required sections present.
        total += (metrics.section_coverage_score * 10) * 0.30
        # Citations (25%): saturates at 10 citations.
        total += (min(metrics.citation_count / 10, 1.0) * 10) * 0.25
        # Word count (20%): full marks inside the style's target band.
        target = get_word_count_target(report_style)
        if target:
            if target["min"] <= metrics.word_count <= target["max"]:
                length_score = 10.0
            elif metrics.word_count < target["min"]:
                # Under-length reports scale linearly, capped at 8.
                length_score = (metrics.word_count / target["min"]) * 8
            else:
                # Over-length reports lose points but never drop below 5.
                overshoot = metrics.word_count / target["max"]
                length_score = max(10 - (overshoot - 1) * 5, 5)
            total += length_score * 0.20
        # Source diversity (15%): saturates at 5 unique domains.
        total += (min(metrics.unique_sources / 5, 1.0) * 10) * 0.15
        # Images (10%): saturates at 3 images.
        total += (min(metrics.image_count / 3, 1.0) * 10) * 0.10
        return round(total, 2)

    def _generate_summary(
        self,
        metrics: ReportMetrics,
        llm_eval: Optional[EvaluationResult],
        final_score: float,
        grade: str,
    ) -> str:
        """Render a human-readable markdown summary of the evaluation."""
        parts = [f"Report Grade: {grade} ({final_score}/10)", ""]
        parts += [
            "**Automated Metrics:**",
            f"- Word Count: {metrics.word_count}",
            f"- Citations: {metrics.citation_count}",
            f"- Unique Sources: {metrics.unique_sources}",
            f"- Images: {metrics.image_count}",
            f"- Section Coverage: {metrics.section_coverage_score * 100:.0f}%",
        ]
        if metrics.sections_missing:
            parts.append(f"- Missing Sections: {', '.join(metrics.sections_missing)}")
        if llm_eval:
            parts += ["", "**LLM Evaluation:**"]
            for criterion, score in llm_eval.scores.items():
                parts.append(f"- {criterion.replace('_', ' ').title()}: {score}/10")
            # Only the top three strengths/weaknesses are surfaced.
            if llm_eval.strengths:
                parts += ["", "**Strengths:**"]
                parts += [f"- {s}" for s in llm_eval.strengths[:3]]
            if llm_eval.weaknesses:
                parts += ["", "**Areas for Improvement:**"]
                parts += [f"- {w}" for w in llm_eval.weaknesses[:3]]
        return "\n".join(parts)

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """
        Evaluate a report using both metrics and LLM.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report

        Returns:
            CombinedEvaluation with full results
        """
        metrics = compute_metrics(report, report_style)
        auto_score = self._compute_metrics_score(metrics, report_style)

        judge_result = None
        if self.use_llm and self.llm_judge:
            try:
                judge_result = await self.llm_judge.evaluate(
                    report, query, report_style
                )
            except Exception as e:
                # Degrade gracefully: metrics alone still produce a score.
                logger.warning(f"LLM evaluation failed, using metrics only: {e}")

        # Blend 40% metrics / 60% LLM when the judge produced a usable score.
        if judge_result and judge_result.overall_score > 0:
            combined = (auto_score * 0.4) + (judge_result.weighted_score * 0.6)
        else:
            combined = auto_score
        combined = round(combined, 2)

        grade = score_to_grade(combined)
        return CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=judge_result,
            final_score=combined,
            grade=grade,
            summary=self._generate_summary(metrics, judge_result, combined, grade),
        )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """Synchronous wrapper around evaluate() for non-async callers."""
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))

    def evaluate_metrics_only(
        self,
        report: str,
        report_style: str = "default",
    ) -> Dict[str, Any]:
        """
        Quick evaluation using only automated metrics (no LLM).

        Args:
            report: The report text to evaluate
            report_style: The style of report

        Returns:
            Dictionary with keys "metrics", "score", and "grade".
        """
        metrics = compute_metrics(report, report_style)
        auto_score = self._compute_metrics_score(metrics, report_style)
        return {
            "metrics": metrics.to_dict(),
            "score": auto_score,
            "grade": score_to_grade(auto_score),
        }

282
src/eval/llm_judge.py Normal file
View File

@@ -0,0 +1,282 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""
LLM-as-Judge evaluation for report quality.
Uses an LLM to evaluate reports on multiple quality dimensions,
providing more nuanced assessment than automated metrics alone.
"""
import json
import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from langchain_core.messages import HumanMessage, SystemMessage
logger = logging.getLogger(__name__)
# Maximum characters of report content to send to the LLM for evaluation.
# This limit prevents exceeding LLM context windows and controls token usage.
MAX_REPORT_LENGTH = 15000

# Evaluation rubric: criterion name -> description + weight. The weights are
# consumed by LLMJudge._calculate_weighted_score and sum to 1.0.
EVALUATION_CRITERIA = {
    "factual_accuracy": {
        "description": "Are claims supported by cited sources? Is information accurate and verifiable?",
        "weight": 0.25,
    },
    "completeness": {
        "description": "Does the report comprehensively cover all aspects of the topic?",
        "weight": 0.20,
    },
    "coherence": {
        "description": "Is the report logically structured, well-organized, and easy to follow?",
        "weight": 0.20,
    },
    "relevance": {
        "description": "Does the content directly address the research question without unnecessary tangents?",
        "weight": 0.15,
    },
    "citation_quality": {
        "description": "Are sources credible, diverse, and properly cited?",
        "weight": 0.10,
    },
    "writing_quality": {
        "description": "Is the writing clear, professional, and appropriate for the target audience?",
        "weight": 0.10,
    },
}

# System prompt for the judge LLM. It instructs the model to reply with ONLY a
# JSON object so LLMJudge._parse_response can parse it deterministically.
# NOTE(review): keep the criteria listed here in sync with EVALUATION_CRITERIA.
JUDGE_SYSTEM_PROMPT = """You are an expert report quality evaluator. Your task is to objectively assess the quality of research reports.
Evaluate the report on the following criteria, scoring each from 1-10:
1. **Factual Accuracy** (1-10): Are claims supported by cited sources? Is information accurate?
2. **Completeness** (1-10): Does the report cover all aspects of the topic comprehensively?
3. **Coherence** (1-10): Is the report logically structured and easy to follow?
4. **Relevance** (1-10): Does content directly address the research question?
5. **Citation Quality** (1-10): Are sources credible, diverse, and properly cited?
6. **Writing Quality** (1-10): Is the writing clear and appropriate for the audience?
Respond ONLY with a valid JSON object in this exact format:
{
"scores": {
"factual_accuracy": <1-10>,
"completeness": <1-10>,
"coherence": <1-10>,
"relevance": <1-10>,
"citation_quality": <1-10>,
"writing_quality": <1-10>
},
"overall_score": <1-10>,
"strengths": ["strength1", "strength2"],
"weaknesses": ["weakness1", "weakness2"],
"suggestions": ["suggestion1", "suggestion2"]
}
Be objective and thorough in your evaluation."""
@dataclass
class EvaluationResult:
    """Structured outcome of one LLM-as-Judge evaluation run."""

    scores: Dict[str, int]            # per-criterion scores (1-10)
    overall_score: float              # the judge's own overall score
    weighted_score: float             # criteria-weighted average of scores
    strengths: List[str]
    weaknesses: List[str]
    suggestions: List[str]
    raw_response: Optional[str] = None  # unparsed LLM reply, kept for debugging

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; raw_response is intentionally omitted."""
        return dict(
            scores=self.scores,
            overall_score=self.overall_score,
            weighted_score=self.weighted_score,
            strengths=self.strengths,
            weaknesses=self.weaknesses,
            suggestions=self.suggestions,
        )
class LLMJudge:
    """LLM-based report quality evaluator."""

    def __init__(self, llm: Any = None):
        """
        Initialize the LLM Judge.

        Args:
            llm: LangChain-compatible LLM instance. If None, will be created on demand.
        """
        self._llm = llm

    def _get_llm(self):
        """Return the configured LLM, lazily creating the default on first use."""
        if self._llm is None:
            # Imported here to avoid a hard dependency at module-import time.
            from src.llms.llm import get_llm_by_type

            self._llm = get_llm_by_type("basic")
        return self._llm

    def _calculate_weighted_score(self, scores: Dict[str, int]) -> float:
        """Weighted average over known criteria; unrecognized keys are ignored."""
        accumulated = 0
        weight_total = 0
        for name, value in scores.items():
            spec = EVALUATION_CRITERIA.get(name)
            if spec is None:
                continue
            accumulated += value * spec["weight"]
            weight_total += spec["weight"]
        # Guard against an empty/unknown score set.
        return round(accumulated / weight_total, 2) if weight_total > 0 else 0.0

    def _parse_response(self, response: str) -> Dict[str, Any]:
        """Extract the JSON payload from the LLM reply (optionally code-fenced).

        Falls back to neutral mid-scale scores when parsing fails, so callers
        always receive a structurally valid result.
        """
        try:
            payload = response
            if "```json" in response:
                payload = response.split("```json")[1].split("```")[0]
            elif "```" in response:
                payload = response.split("```")[1].split("```")[0]
            return json.loads(payload.strip())
        except (json.JSONDecodeError, IndexError) as e:
            logger.warning(f"Failed to parse LLM response: {e}")
            return {
                "scores": {
                    "factual_accuracy": 5,
                    "completeness": 5,
                    "coherence": 5,
                    "relevance": 5,
                    "citation_quality": 5,
                    "writing_quality": 5,
                },
                "overall_score": 5,
                "strengths": ["Unable to parse evaluation"],
                "weaknesses": ["Evaluation parsing failed"],
                "suggestions": ["Please re-run evaluation"],
            }

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> EvaluationResult:
        """
        Evaluate a report using LLM-as-Judge.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report for context

        Returns:
            EvaluationResult with scores and feedback
        """
        llm = self._get_llm()
        # Report content is truncated to keep the request within context limits.
        user_prompt = f"""Please evaluate the following research report.
**Original Research Query:** {query}
**Report Style:** {report_style}
**Report to Evaluate:**
{report[:MAX_REPORT_LENGTH]}
Provide your evaluation in the specified JSON format."""
        messages = [
            SystemMessage(content=JUDGE_SYSTEM_PROMPT),
            HumanMessage(content=user_prompt),
        ]
        try:
            raw = await llm.ainvoke(messages)
            # Some LLM wrappers return plain strings rather than message objects.
            reply = raw.content if hasattr(raw, "content") else str(raw)
            parsed = self._parse_response(reply)
            criterion_scores = parsed.get("scores", {})
            return EvaluationResult(
                scores=criterion_scores,
                overall_score=parsed.get("overall_score", 5),
                weighted_score=self._calculate_weighted_score(criterion_scores),
                strengths=parsed.get("strengths", []),
                weaknesses=parsed.get("weaknesses", []),
                suggestions=parsed.get("suggestions", []),
                raw_response=reply,
            )
        except Exception as e:
            logger.error(f"LLM evaluation failed: {e}")
            # All-zero scores signal "no usable LLM result" to downstream code.
            return EvaluationResult(
                scores={
                    "factual_accuracy": 0,
                    "completeness": 0,
                    "coherence": 0,
                    "relevance": 0,
                    "citation_quality": 0,
                    "writing_quality": 0,
                },
                overall_score=0,
                weighted_score=0,
                strengths=[],
                weaknesses=[f"Evaluation failed: {str(e)}"],
                suggestions=["Please retry evaluation"],
            )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> EvaluationResult:
        """
        Synchronous version of evaluate.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report for context

        Returns:
            EvaluationResult with scores and feedback
        """
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))
async def evaluate_with_llm(
    report: str,
    query: str,
    report_style: str = "default",
    llm: Any = None,
) -> EvaluationResult:
    """
    Convenience wrapper: build a one-off LLMJudge and evaluate a report.

    Args:
        report: The report text to evaluate
        query: The original research query
        report_style: The style of report for context
        llm: Optional LLM instance to use

    Returns:
        EvaluationResult with scores and feedback
    """
    return await LLMJudge(llm=llm).evaluate(report, query, report_style)

229
src/eval/metrics.py Normal file
View File

@@ -0,0 +1,229 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""
Automated metrics for report quality evaluation.
These metrics can be computed without LLM calls, providing fast and
deterministic quality assessment.
"""
import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from urllib.parse import urlparse
@dataclass
class ReportMetrics:
    """Container for computed report metrics; all fields default to "empty"."""

    word_count: int = 0                 # English words + CJK characters
    citation_count: int = 0             # markdown links with http(s) targets
    unique_sources: int = 0             # distinct normalized domains cited
    image_count: int = 0                # markdown image tags
    section_count: int = 0              # number of required sections detected
    sections_found: List[str] = field(default_factory=list)
    sections_missing: List[str] = field(default_factory=list)
    section_coverage_score: float = 0.0  # found / required, in [0, 1]
    has_title: bool = False
    has_key_points: bool = False
    has_overview: bool = False
    has_citations_section: bool = False

    def to_dict(self) -> Dict:
        """Serialize every metric field into a plain dictionary."""
        return dict(
            word_count=self.word_count,
            citation_count=self.citation_count,
            unique_sources=self.unique_sources,
            image_count=self.image_count,
            section_count=self.section_count,
            sections_found=self.sections_found,
            sections_missing=self.sections_missing,
            section_coverage_score=self.section_coverage_score,
            has_title=self.has_title,
            has_key_points=self.has_key_points,
            has_overview=self.has_overview,
            has_citations_section=self.has_citations_section,
        )
# Required sections for different report styles. Unknown styles fall back to
# "default" (see detect_sections). Keys must match SECTION_PATTERNS entries.
REPORT_STYLE_SECTIONS = {
    "default": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "academic": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "literature_review",
        "methodology",
        "key_citations",
    ],
    "news": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "popular_science": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "social_media": [
        "title",
        "key_points",
        "overview",
        "key_citations",
    ],
    "strategic_investment": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "executive_summary",
        "market_analysis",
        "technology_analysis",
        "investment_recommendations",
        "key_citations",
    ],
}

# Section name patterns for detection (supports both English and Chinese).
# Each value is a regex fragment matched case-insensitively against the
# lowercased report text, except "title", which is matched as a markdown H1
# line against the original text (see detect_sections).
SECTION_PATTERNS = {
    "title": r"^#\s+.+",
    "key_points": r"(?:key\s*points|要点|关键发现|核心观点)",
    "overview": r"(?:overview|概述|简介|背景)",
    "detailed_analysis": r"(?:detailed\s*analysis|详细分析|深度分析|分析)",
    "key_citations": r"(?:key\s*citations|references|参考文献|引用|来源)",
    "literature_review": r"(?:literature\s*review|文献综述|研究回顾)",
    "methodology": r"(?:methodology|方法论|研究方法)",
    "executive_summary": r"(?:executive\s*summary|执行摘要|投资建议)",
    "market_analysis": r"(?:market\s*analysis|市场分析|产业分析)",
    "technology_analysis": r"(?:technology|技术.*(?:分析|解析|深度))",
    "investment_recommendations": r"(?:investment.*recommend|投资建议|投资评级)",
}
def count_words(text: str) -> int:
    """Count words: whole ASCII-letter runs plus individual CJK characters."""
    latin = re.findall(r"\b[a-zA-Z]+\b", text)
    cjk = re.findall(r"[\u4e00-\u9fff]", text)
    return len(latin) + len(cjk)
def count_citations(text: str) -> int:
    """Count markdown links whose target is an http(s) URL."""
    return len(re.findall(r"\[.+?\]\(https?://[^\s\)]+\)", text))
def extract_domains(text: str) -> List[str]:
    """Extract unique normalized domains from URLs in the text.

    Domains are lowercased and a leading "www." prefix is stripped. The
    returned list has set (arbitrary) ordering.

    Args:
        text: Markdown/plain text possibly containing http(s) URLs.

    Returns:
        List of unique domain strings.
    """
    url_pattern = r"https?://([^\s\)\]]+)"
    urls = re.findall(url_pattern, text)
    domains = set()
    for url in urls:
        try:
            parsed = urlparse(f"http://{url}")
            domain = parsed.netloc or url.split("/")[0]
            # Strip only a LEADING "www." — the previous str.replace() also
            # mangled interior occurrences (e.g. "sub.www.example.com").
            domain = domain.lower().removeprefix("www.")
            if domain:
                domains.add(domain)
        except Exception:
            # Malformed URL fragments are skipped rather than failing the run.
            continue
    return list(domains)
def count_images(text: str) -> int:
    """Count markdown image tags of the form ![alt](url)."""
    return len(re.findall(r"!\[.*?\]\(.+?\)", text))
def detect_sections(text: str, report_style: str = "default") -> Dict[str, bool]:
    """Return a mapping of required section name -> whether it was detected.

    The required set comes from REPORT_STYLE_SECTIONS (unknown styles use the
    default set); patterns come from SECTION_PATTERNS, falling back to the
    section name itself with underscores treated as optional whitespace.
    """
    required = REPORT_STYLE_SECTIONS.get(
        report_style, REPORT_STYLE_SECTIONS["default"]
    )
    lowered = text.lower()
    found: Dict[str, bool] = {}
    for name in required:
        regex = SECTION_PATTERNS.get(name, name.replace("_", r"\s*"))
        if name == "title":
            # Title must appear as a markdown H1 line in the original text.
            hit = re.search(regex, text, re.MULTILINE)
        else:
            hit = re.search(regex, lowered, re.IGNORECASE | re.MULTILINE)
        found[name] = bool(hit)
    return found
def compute_metrics(
    report: str, report_style: str = "default", target_word_count: Optional[int] = None
) -> ReportMetrics:
    """
    Compute automated metrics for a report.

    Args:
        report: The report text in markdown format
        report_style: The style of report (academic, news, etc.)
        target_word_count: Currently unused; word-count targets come from the
            report style (see get_word_count_target). Kept for compatibility.

    Returns:
        ReportMetrics object with computed values
    """
    detected = detect_sections(report, report_style)
    found = [name for name, present in detected.items() if present]
    missing = [name for name, present in detected.items() if not present]
    coverage = len(found) / len(detected) if detected else 0.0
    return ReportMetrics(
        word_count=count_words(report),
        citation_count=count_citations(report),
        unique_sources=len(extract_domains(report)),
        image_count=count_images(report),
        section_count=len(found),
        sections_found=found,
        sections_missing=missing,
        section_coverage_score=coverage,
        has_title=detected.get("title", False),
        has_key_points=detected.get("key_points", False),
        has_overview=detected.get("overview", False),
        has_citations_section=detected.get("key_citations", False),
    )
def get_word_count_target(report_style: str) -> Dict[str, int]:
    """Return the {"min", "max"} word-count band for a report style.

    Unknown styles fall back to the "default" band, so a dict is always
    returned (never None).
    """
    default_band = {"min": 1000, "max": 5000}
    bands = {
        "strategic_investment": {"min": 10000, "max": 15000},
        "academic": {"min": 3000, "max": 8000},
        "news": {"min": 800, "max": 2000},
        "popular_science": {"min": 1500, "max": 4000},
        "social_media": {"min": 500, "max": 1500},
        "default": default_band,
    }
    return bands.get(report_style, default_band)

View File

@@ -35,6 +35,7 @@ from src.podcast.graph.builder import build_graph as build_podcast_graph
from src.ppt.graph.builder import build_graph as build_ppt_graph
from src.prompt_enhancer.graph.builder import build_graph as build_prompt_enhancer_graph
from src.prose.graph.builder import build_graph as build_prose_graph
from src.eval import ReportEvaluator
from src.rag.builder import build_retriever
from src.rag.milvus import load_examples as load_milvus_examples
from src.rag.qdrant import load_examples as load_qdrant_examples
@@ -47,6 +48,7 @@ from src.server.chat_request import (
GenerateProseRequest,
TTSRequest,
)
from src.server.eval_request import EvaluateReportRequest, EvaluateReportResponse
from src.server.config_request import ConfigResponse
from src.server.mcp_request import MCPServerMetadataRequest, MCPServerMetadataResponse
from src.server.mcp_utils import load_mcp_tools
@@ -946,6 +948,39 @@ async def generate_prose(request: GenerateProseRequest):
raise HTTPException(status_code=500, detail=INTERNAL_SERVER_ERROR_DETAIL)
@app.post("/api/report/evaluate", response_model=EvaluateReportResponse)
async def evaluate_report(request: EvaluateReportRequest):
    """Evaluate report quality using automated metrics and optionally LLM-as-Judge."""
    try:
        # LLM judging is opt-in per request: it is slower and consumes tokens.
        evaluator = ReportEvaluator(use_llm=request.use_llm)
        if request.use_llm:
            # Full evaluation: automated metrics blended with LLM-as-Judge.
            result = await evaluator.evaluate(
                request.content, request.query, request.report_style or "default"
            )
            return EvaluateReportResponse(
                metrics=result.metrics.to_dict(),
                score=result.final_score,
                grade=result.grade,
                llm_evaluation=result.llm_evaluation.to_dict()
                if result.llm_evaluation
                else None,
                summary=result.summary,
            )
        else:
            # Fast path: deterministic metrics only, no model call.
            result = evaluator.evaluate_metrics_only(
                request.content, request.report_style or "default"
            )
            return EvaluateReportResponse(
                metrics=result["metrics"],
                score=result["score"],
                grade=result["grade"],
            )
    except Exception as e:
        # Hide internals from clients; the full traceback goes to the server log.
        logger.exception(f"Error occurred during report evaluation: {str(e)}")
        raise HTTPException(status_code=500, detail=INTERNAL_SERVER_ERROR_DETAIL)
@app.post("/api/prompt/enhance")
async def enhance_prompt(request: EnhancePromptRequest):
try:

View File

@@ -0,0 +1,71 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""Request models for report evaluation endpoint."""
from typing import Optional
from pydantic import BaseModel, Field
class EvaluateReportRequest(BaseModel):
    """Request model for report evaluation."""

    content: str = Field(description="Report markdown content to evaluate")
    query: str = Field(description="Original research query")
    # Drives section/word-count expectations; defaults to the generic style.
    report_style: Optional[str] = Field(
        default="default", description="Report style (academic, news, etc.)"
    )
    # Off by default: the LLM pass is slower and consumes tokens.
    use_llm: bool = Field(
        default=False,
        description="Whether to use LLM for deep evaluation (slower but more detailed)",
    )
class EvaluationMetrics(BaseModel):
    """Automated metrics result (mirrors src.eval.metrics.ReportMetrics.to_dict)."""

    word_count: int
    citation_count: int
    unique_sources: int          # distinct cited domains
    image_count: int
    section_count: int           # required sections detected
    section_coverage_score: float  # found / required, in [0, 1]
    sections_found: list[str]
    sections_missing: list[str]
    has_title: bool
    has_key_points: bool
    has_overview: bool
    has_citations_section: bool
class LLMEvaluationScores(BaseModel):
    """Per-criterion LLM judge scores (1-10; 0 indicates a failed evaluation)."""

    factual_accuracy: int = 0
    completeness: int = 0
    coherence: int = 0
    relevance: int = 0
    citation_quality: int = 0
    writing_quality: int = 0
class LLMEvaluation(BaseModel):
    """LLM evaluation result (mirrors src.eval.llm_judge.EvaluationResult.to_dict)."""

    scores: LLMEvaluationScores
    overall_score: float   # the judge's own overall score
    weighted_score: float  # criteria-weighted average
    strengths: list[str]
    weaknesses: list[str]
    suggestions: list[str]
class EvaluateReportResponse(BaseModel):
    """Response model for report evaluation."""

    metrics: EvaluationMetrics
    score: float  # final blended score (0-10)
    grade: str    # letter grade, "A+" through "F"
    # Present only when the request asked for LLM evaluation and it succeeded.
    llm_evaluation: Optional[LLMEvaluation] = None
    # Markdown summary; only produced on the full (LLM) evaluation path.
    summary: Optional[str] = None