# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
Report Quality Evaluation Module for DeerFlow.

Exposes objective tools for assessing generated report quality: fast
automated metrics plus an optional LLM-based (LLM-as-Judge) evaluation.
"""

from .evaluator import ReportEvaluator
from .llm_judge import LLMJudge, evaluate_with_llm
from .metrics import ReportMetrics, compute_metrics

# Public API of the package.
__all__ = [
    "ReportEvaluator",
    "ReportMetrics",
    "compute_metrics",
    "LLMJudge",
    "evaluate_with_llm",
]
"""
Combined report evaluator orchestrating both automated metrics and LLM evaluation.
"""

import logging
from dataclasses import dataclass
from typing import Any, Dict, Optional

from .llm_judge import EvaluationResult, LLMJudge
from .metrics import ReportMetrics, compute_metrics, get_word_count_target

logger = logging.getLogger(__name__)


@dataclass
class CombinedEvaluation:
    """Combined evaluation results from metrics and LLM judge."""

    metrics: ReportMetrics
    llm_evaluation: Optional[EvaluationResult]
    final_score: float
    grade: str
    summary: str

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary format (nested results serialized too)."""
        llm_part = self.llm_evaluation.to_dict() if self.llm_evaluation else None
        return {
            "metrics": self.metrics.to_dict(),
            "llm_evaluation": llm_part,
            "final_score": self.final_score,
            "grade": self.grade,
            "summary": self.summary,
        }


# (threshold, grade) pairs, best first; the first threshold the score
# reaches determines the grade. Anything below 4.0 is an "F".
_GRADE_BANDS = (
    (9.0, "A+"),
    (8.5, "A"),
    (8.0, "A-"),
    (7.5, "B+"),
    (7.0, "B"),
    (6.5, "B-"),
    (6.0, "C+"),
    (5.5, "C"),
    (5.0, "C-"),
    (4.0, "D"),
)


def score_to_grade(score: float) -> str:
    """Convert numeric score to letter grade."""
    for threshold, grade in _GRADE_BANDS:
        if score >= threshold:
            return grade
    return "F"


class ReportEvaluator:
    """
    Combined report evaluator using both automated metrics and LLM-as-Judge.

    Assessment happens in up to three steps:
    1. Computing automated metrics (fast, deterministic)
    2. Running LLM-based evaluation (nuanced, contextual)
    3. Combining both into a final score and letter grade
    """

    def __init__(self, llm: Any = None, use_llm: bool = True):
        """
        Initialize the evaluator.

        Args:
            llm: Optional LLM instance for LLM-as-Judge evaluation
            use_llm: Whether to use LLM evaluation (can be disabled for speed)
        """
        self.use_llm = use_llm
        self.llm_judge = LLMJudge(llm=llm) if use_llm else None

    def _compute_metrics_score(
        self, metrics: ReportMetrics, report_style: str
    ) -> float:
        """
        Convert automated metrics to a 0-10 score.

        Weighting: section coverage 30%, citation quality 25%, word-count
        compliance 20%, source diversity 15%, image inclusion 10%.
        """
        total = 0.0

        # Section coverage (30%): coverage ratio scaled to 0-10.
        total += (metrics.section_coverage_score * 10) * 0.30

        # Citations (25%): saturates at 10 citations.
        total += (min(metrics.citation_count / 10, 1.0) * 10) * 0.25

        # Word count (20%): full marks inside the style's target band,
        # linear penalty when short, softer capped penalty when long.
        target = get_word_count_target(report_style)
        if target:
            if target["min"] <= metrics.word_count <= target["max"]:
                word_score = 10.0
            elif metrics.word_count < target["min"]:
                word_score = (metrics.word_count / target["min"]) * 8
            else:
                excess_ratio = metrics.word_count / target["max"]
                word_score = max(10 - (excess_ratio - 1) * 5, 5)
            total += word_score * 0.20

        # Source diversity (15%): saturates at 5 unique domains.
        total += (min(metrics.unique_sources / 5, 1.0) * 10) * 0.15

        # Images (10%): saturates at 3 images.
        total += (min(metrics.image_count / 3, 1.0) * 10) * 0.10

        return round(total, 2)

    def _generate_summary(
        self,
        metrics: ReportMetrics,
        llm_eval: Optional[EvaluationResult],
        final_score: float,
        grade: str,
    ) -> str:
        """Generate a human-readable evaluation summary (markdown)."""
        lines = [f"Report Grade: {grade} ({final_score}/10)", ""]

        lines += [
            "**Automated Metrics:**",
            f"- Word Count: {metrics.word_count}",
            f"- Citations: {metrics.citation_count}",
            f"- Unique Sources: {metrics.unique_sources}",
            f"- Images: {metrics.image_count}",
            f"- Section Coverage: {metrics.section_coverage_score * 100:.0f}%",
        ]
        if metrics.sections_missing:
            lines.append(f"- Missing Sections: {', '.join(metrics.sections_missing)}")

        if llm_eval:
            lines += ["", "**LLM Evaluation:**"]
            for criterion, score in llm_eval.scores.items():
                lines.append(f"- {criterion.replace('_', ' ').title()}: {score}/10")

            # Only surface the top three strengths/weaknesses to keep the
            # summary compact.
            if llm_eval.strengths:
                lines += ["", "**Strengths:**"]
                lines += [f"- {strength}" for strength in llm_eval.strengths[:3]]
            if llm_eval.weaknesses:
                lines += ["", "**Areas for Improvement:**"]
                lines += [f"- {weakness}" for weakness in llm_eval.weaknesses[:3]]

        return "\n".join(lines)

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """
        Evaluate a report using both metrics and LLM.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report

        Returns:
            CombinedEvaluation with full results
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)

        llm_eval = None
        if self.use_llm and self.llm_judge:
            try:
                llm_eval = await self.llm_judge.evaluate(report, query, report_style)
            except Exception as e:
                logger.warning(f"LLM evaluation failed, using metrics only: {e}")

        # Blend 40% metrics / 60% LLM when a usable LLM result exists;
        # otherwise fall back to the metrics score alone.
        if llm_eval and llm_eval.overall_score > 0:
            final_score = (metrics_score * 0.4) + (llm_eval.weighted_score * 0.6)
        else:
            final_score = metrics_score
        final_score = round(final_score, 2)

        grade = score_to_grade(final_score)
        summary = self._generate_summary(metrics, llm_eval, final_score, grade)

        return CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=llm_eval,
            final_score=final_score,
            grade=grade,
            summary=summary,
        )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """Synchronous version of evaluate.

        NOTE(review): uses asyncio.run, so it cannot be called from inside a
        running event loop — use `evaluate` there instead.
        """
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))

    def evaluate_metrics_only(
        self,
        report: str,
        report_style: str = "default",
    ) -> Dict[str, Any]:
        """
        Quick evaluation using only automated metrics (no LLM).

        Args:
            report: The report text to evaluate
            report_style: The style of report

        Returns:
            Dictionary with metrics, score and grade
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)
        return {
            "metrics": metrics.to_dict(),
            "score": metrics_score,
            "grade": score_to_grade(metrics_score),
        }


# --- src/eval/llm_judge.py (module header) ---
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
LLM-as-Judge evaluation for report quality.

Uses an LLM to evaluate reports on multiple quality dimensions,
providing more nuanced assessment than automated metrics alone.
"""

import json
import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from langchain_core.messages import HumanMessage, SystemMessage

logger = logging.getLogger(__name__)

# Maximum characters of report content to send to the LLM for evaluation.
# This limit prevents exceeding LLM context windows and controls token usage.
MAX_REPORT_LENGTH = 15000
# Evaluation rubric: per-criterion description and relative weight.
# Weights sum to 1.0; LLMJudge renormalizes if only a subset is scored.
EVALUATION_CRITERIA = {
    "factual_accuracy": {
        "description": "Are claims supported by cited sources? Is information accurate and verifiable?",
        "weight": 0.25,
    },
    "completeness": {
        "description": "Does the report comprehensively cover all aspects of the topic?",
        "weight": 0.20,
    },
    "coherence": {
        "description": "Is the report logically structured, well-organized, and easy to follow?",
        "weight": 0.20,
    },
    "relevance": {
        "description": "Does the content directly address the research question without unnecessary tangents?",
        "weight": 0.15,
    },
    "citation_quality": {
        "description": "Are sources credible, diverse, and properly cited?",
        "weight": 0.10,
    },
    "writing_quality": {
        "description": "Is the writing clear, professional, and appropriate for the target audience?",
        "weight": 0.10,
    },
}

# System prompt sent verbatim to the judge model; it pins the JSON schema
# the parser in LLMJudge._parse_response expects.
JUDGE_SYSTEM_PROMPT = """You are an expert report quality evaluator. Your task is to objectively assess the quality of research reports.

Evaluate the report on the following criteria, scoring each from 1-10:

1. **Factual Accuracy** (1-10): Are claims supported by cited sources? Is information accurate?
2. **Completeness** (1-10): Does the report cover all aspects of the topic comprehensively?
3. **Coherence** (1-10): Is the report logically structured and easy to follow?
4. **Relevance** (1-10): Does content directly address the research question?
5. **Citation Quality** (1-10): Are sources credible, diverse, and properly cited?
6. **Writing Quality** (1-10): Is the writing clear and appropriate for the audience?

Respond ONLY with a valid JSON object in this exact format:
{
    "scores": {
        "factual_accuracy": <1-10>,
        "completeness": <1-10>,
        "coherence": <1-10>,
        "relevance": <1-10>,
        "citation_quality": <1-10>,
        "writing_quality": <1-10>
    },
    "overall_score": <1-10>,
    "strengths": ["strength1", "strength2"],
    "weaknesses": ["weakness1", "weakness2"],
    "suggestions": ["suggestion1", "suggestion2"]
}

Be objective and thorough in your evaluation."""


@dataclass
class EvaluationResult:
    """Container for LLM evaluation results."""

    # Per-criterion 1-10 scores keyed by criterion name.
    scores: Dict[str, int]
    # Overall score as reported directly by the judge model.
    overall_score: float
    # Score recomputed from `scores` using EVALUATION_CRITERIA weights.
    weighted_score: float
    strengths: List[str]
    weaknesses: List[str]
    suggestions: List[str]
    # Raw model output, kept for debugging; excluded from to_dict().
    raw_response: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert evaluation result to dictionary (raw_response omitted)."""
        return {
            "scores": self.scores,
            "overall_score": self.overall_score,
            "weighted_score": self.weighted_score,
            "strengths": self.strengths,
            "weaknesses": self.weaknesses,
            "suggestions": self.suggestions,
        }
class LLMJudge:
    """LLM-based report quality evaluator."""

    def __init__(self, llm: Any = None):
        """
        Initialize the LLM Judge.

        Args:
            llm: LangChain-compatible LLM instance. If None, will be created on demand.
        """
        self._llm = llm

    def _get_llm(self):
        """Lazily resolve the LLM, defaulting to the project's "basic" model."""
        if self._llm is None:
            from src.llms.llm import get_llm_by_type

            self._llm = get_llm_by_type("basic")
        return self._llm

    def _calculate_weighted_score(self, scores: Dict[str, int]) -> float:
        """Weighted average over known criteria; unknown keys are ignored.

        Weights are renormalized when only a subset of criteria is present.
        """
        pairs = [
            (value, EVALUATION_CRITERIA[name]["weight"])
            for name, value in scores.items()
            if name in EVALUATION_CRITERIA
        ]
        weight_total = sum(w for _, w in pairs)
        if weight_total <= 0:
            return 0.0
        return round(sum(v * w for v, w in pairs) / weight_total, 2)

    def _parse_response(self, response: str) -> Dict[str, Any]:
        """Parse LLM response into structured format.

        Strips an optional markdown code fence before JSON-decoding. On any
        parse failure, a neutral all-5s fallback evaluation is returned so
        callers always get a well-formed dict.
        """
        try:
            candidate = response
            if "```json" in response:
                candidate = response.split("```json")[1].split("```")[0]
            elif "```" in response:
                candidate = response.split("```")[1].split("```")[0]
            return json.loads(candidate.strip())
        except (json.JSONDecodeError, IndexError) as e:
            logger.warning(f"Failed to parse LLM response: {e}")
            return {
                "scores": {
                    "factual_accuracy": 5,
                    "completeness": 5,
                    "coherence": 5,
                    "relevance": 5,
                    "citation_quality": 5,
                    "writing_quality": 5,
                },
                "overall_score": 5,
                "strengths": ["Unable to parse evaluation"],
                "weaknesses": ["Evaluation parsing failed"],
                "suggestions": ["Please re-run evaluation"],
            }

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> EvaluationResult:
        """
        Evaluate a report using LLM-as-Judge.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report for context

        Returns:
            EvaluationResult with scores and feedback (all-zero scores on failure)
        """
        llm = self._get_llm()

        # Report is truncated to MAX_REPORT_LENGTH to stay within context.
        user_prompt = f"""Please evaluate the following research report.

**Original Research Query:** {query}

**Report Style:** {report_style}

**Report to Evaluate:**
{report[:MAX_REPORT_LENGTH]}

Provide your evaluation in the specified JSON format."""

        messages = [
            SystemMessage(content=JUDGE_SYSTEM_PROMPT),
            HumanMessage(content=user_prompt),
        ]

        try:
            response = await llm.ainvoke(messages)
            response_text = (
                response.content if hasattr(response, "content") else str(response)
            )

            parsed = self._parse_response(response_text)
            criterion_scores = parsed.get("scores", {})

            return EvaluationResult(
                scores=criterion_scores,
                overall_score=parsed.get("overall_score", 5),
                weighted_score=self._calculate_weighted_score(criterion_scores),
                strengths=parsed.get("strengths", []),
                weaknesses=parsed.get("weaknesses", []),
                suggestions=parsed.get("suggestions", []),
                raw_response=response_text,
            )

        except Exception as e:
            logger.error(f"LLM evaluation failed: {e}")
            zeroed = {
                "factual_accuracy": 0,
                "completeness": 0,
                "coherence": 0,
                "relevance": 0,
                "citation_quality": 0,
                "writing_quality": 0,
            }
            return EvaluationResult(
                scores=zeroed,
                overall_score=0,
                weighted_score=0,
                strengths=[],
                weaknesses=[f"Evaluation failed: {str(e)}"],
                suggestions=["Please retry evaluation"],
            )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> EvaluationResult:
        """
        Synchronous version of evaluate.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report for context

        Returns:
            EvaluationResult with scores and feedback
        """
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))


async def evaluate_with_llm(
    report: str,
    query: str,
    report_style: str = "default",
    llm: Any = None,
) -> EvaluationResult:
    """
    Convenience function to evaluate a report with LLM.

    Args:
        report: The report text to evaluate
        query: The original research query
        report_style: The style of report for context
        llm: Optional LLM instance to use

    Returns:
        EvaluationResult with scores and feedback
    """
    judge = LLMJudge(llm=llm)
    return await judge.evaluate(report, query, report_style)
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
Automated metrics for report quality evaluation.

These metrics can be computed without LLM calls, providing fast and
deterministic quality assessment.
"""

import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from urllib.parse import urlparse


@dataclass
class ReportMetrics:
    """Container for computed report metrics."""

    # Raw counts extracted from the markdown report.
    word_count: int = 0
    citation_count: int = 0
    unique_sources: int = 0
    image_count: int = 0
    # Section/structure information.
    section_count: int = 0
    sections_found: List[str] = field(default_factory=list)
    sections_missing: List[str] = field(default_factory=list)
    section_coverage_score: float = 0.0
    # Presence flags for the most important structural sections.
    has_title: bool = False
    has_key_points: bool = False
    has_overview: bool = False
    has_citations_section: bool = False

    def to_dict(self) -> Dict:
        """Convert metrics to dictionary."""
        return {
            "word_count": self.word_count,
            "citation_count": self.citation_count,
            "unique_sources": self.unique_sources,
            "image_count": self.image_count,
            "section_count": self.section_count,
            "sections_found": self.sections_found,
            "sections_missing": self.sections_missing,
            "section_coverage_score": self.section_coverage_score,
            "has_title": self.has_title,
            "has_key_points": self.has_key_points,
            "has_overview": self.has_overview,
            "has_citations_section": self.has_citations_section,
        }
# Required sections for different report styles
REPORT_STYLE_SECTIONS = {
    "default": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "academic": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "literature_review",
        "methodology",
        "key_citations",
    ],
    "news": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "popular_science": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "social_media": [
        "title",
        "key_points",
        "overview",
        "key_citations",
    ],
    "strategic_investment": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "executive_summary",
        "market_analysis",
        "technology_analysis",
        "investment_recommendations",
        "key_citations",
    ],
}

# Section name patterns for detection (supports both English and Chinese)
SECTION_PATTERNS = {
    "title": r"^#\s+.+",
    "key_points": r"(?:key\s*points|要点|关键发现|核心观点)",
    "overview": r"(?:overview|概述|简介|背景)",
    "detailed_analysis": r"(?:detailed\s*analysis|详细分析|深度分析|分析)",
    "key_citations": r"(?:key\s*citations|references|参考文献|引用|来源)",
    "literature_review": r"(?:literature\s*review|文献综述|研究回顾)",
    "methodology": r"(?:methodology|方法论|研究方法)",
    "executive_summary": r"(?:executive\s*summary|执行摘要|投资建议)",
    "market_analysis": r"(?:market\s*analysis|市场分析|产业分析)",
    "technology_analysis": r"(?:technology|技术.*(?:分析|解析|深度))",
    "investment_recommendations": r"(?:investment.*recommend|投资建议|投资评级)",
}


def count_words(text: str) -> int:
    """Count words in text, handling both English and Chinese.

    English is counted by word tokens; Chinese by individual CJK characters.
    """
    english_words = len(re.findall(r"\b[a-zA-Z]+\b", text))
    chinese_chars = len(re.findall(r"[\u4e00-\u9fff]", text))
    return english_words + chinese_chars


def count_citations(text: str) -> int:
    """Count markdown-style citations [text](url), excluding image embeds.

    FIX: the previous pattern also matched the ``[alt](url)`` inside image
    syntax ``![alt](url)``, so every embedded image was double-counted as a
    citation (images are scored separately by the evaluator). The negative
    lookbehind ``(?<!!)`` skips links preceded by ``!``.
    """
    pattern = r"(?<!!)\[.+?\]\(https?://[^\s\)]+\)"
    return len(re.findall(pattern, text))


def extract_domains(text: str) -> List[str]:
    """Extract unique domains from URLs in the text.

    Returns:
        Unordered list of lowercase domains with a leading "www." removed.
    """
    url_pattern = r"https?://([^\s\)\]]+)"
    urls = re.findall(url_pattern, text)
    domains = set()
    for url in urls:
        try:
            parsed = urlparse(f"http://{url}")
            domain = parsed.netloc or url.split("/")[0]
            # FIX: strip only a LEADING "www." — str.replace removed the
            # substring anywhere in the domain (e.g. "mywww.example.com"
            # would have become "myexample.com").
            domain = domain.lower().removeprefix("www.")
            if domain:
                domains.add(domain)
        except Exception:
            # Malformed URL fragments are simply skipped.
            continue
    return list(domains)


def count_images(text: str) -> int:
    """Count markdown images ![alt](url)."""
    pattern = r"!\[.*?\]\(.+?\)"
    return len(re.findall(pattern, text))


def detect_sections(text: str, report_style: str = "default") -> Dict[str, bool]:
    """Detect which required sections are present in the report.

    Sections required for the style come from REPORT_STYLE_SECTIONS; each is
    matched against SECTION_PATTERNS (falling back to the section name with
    underscores treated as optional whitespace).
    """
    required_sections = REPORT_STYLE_SECTIONS.get(
        report_style, REPORT_STYLE_SECTIONS["default"]
    )
    detected = {}

    text_lower = text.lower()

    for section in required_sections:
        pattern = SECTION_PATTERNS.get(section, section.replace("_", r"\s*"))
        if section == "title":
            # Title must be a markdown H1 at the start of a line; match the
            # original (non-lowercased) text so the pattern stays anchored.
            detected[section] = bool(re.search(pattern, text, re.MULTILINE))
        else:
            detected[section] = bool(
                re.search(pattern, text_lower, re.IGNORECASE | re.MULTILINE)
            )

    return detected


def compute_metrics(
    report: str, report_style: str = "default", target_word_count: Optional[int] = None
) -> ReportMetrics:
    """
    Compute automated metrics for a report.

    Args:
        report: The report text in markdown format
        report_style: The style of report (academic, news, etc.)
        target_word_count: Reserved for future use — currently unused; word
            count targets come from get_word_count_target(report_style).

    Returns:
        ReportMetrics object with computed values
    """
    metrics = ReportMetrics()

    metrics.word_count = count_words(report)
    metrics.citation_count = count_citations(report)

    domains = extract_domains(report)
    metrics.unique_sources = len(domains)

    metrics.image_count = count_images(report)

    sections_detected = detect_sections(report, report_style)
    metrics.sections_found = [s for s, found in sections_detected.items() if found]
    metrics.sections_missing = [
        s for s, found in sections_detected.items() if not found
    ]
    metrics.section_count = len(metrics.sections_found)

    total_sections = len(sections_detected)
    if total_sections > 0:
        metrics.section_coverage_score = len(metrics.sections_found) / total_sections

    metrics.has_title = sections_detected.get("title", False)
    metrics.has_key_points = sections_detected.get("key_points", False)
    metrics.has_overview = sections_detected.get("overview", False)
    metrics.has_citations_section = sections_detected.get("key_citations", False)

    return metrics


def get_word_count_target(report_style: str) -> Dict[str, int]:
    """Get target word count range ({"min": ..., "max": ...}) for a style.

    Unknown styles fall back to the "default" band.
    """
    targets = {
        "strategic_investment": {"min": 10000, "max": 15000},
        "academic": {"min": 3000, "max": 8000},
        "news": {"min": 800, "max": 2000},
        "popular_science": {"min": 1500, "max": 4000},
        "social_media": {"min": 500, "max": 1500},
        "default": {"min": 1000, "max": 5000},
    }
    return targets.get(report_style, targets["default"])
# --- src/server/app.py (additions) ---
from src.eval import ReportEvaluator
from src.server.eval_request import EvaluateReportRequest, EvaluateReportResponse


@app.post("/api/report/evaluate", response_model=EvaluateReportResponse)
async def evaluate_report(request: EvaluateReportRequest):
    """Evaluate report quality using automated metrics and optionally LLM-as-Judge."""
    try:
        evaluator = ReportEvaluator(use_llm=request.use_llm)
        style = request.report_style or "default"

        # Fast path: deterministic metrics only, no LLM round-trip.
        if not request.use_llm:
            quick = evaluator.evaluate_metrics_only(request.content, style)
            return EvaluateReportResponse(
                metrics=quick["metrics"],
                score=quick["score"],
                grade=quick["grade"],
            )

        combined = await evaluator.evaluate(request.content, request.query, style)
        llm_payload = (
            combined.llm_evaluation.to_dict() if combined.llm_evaluation else None
        )
        return EvaluateReportResponse(
            metrics=combined.metrics.to_dict(),
            score=combined.final_score,
            grade=combined.grade,
            llm_evaluation=llm_payload,
            summary=combined.summary,
        )
    except Exception as e:
        logger.exception(f"Error occurred during report evaluation: {str(e)}")
        raise HTTPException(status_code=500, detail=INTERNAL_SERVER_ERROR_DETAIL)


# --- src/server/eval_request.py ---
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""Request models for report evaluation endpoint."""

from typing import Optional

from pydantic import BaseModel, Field


class EvaluateReportRequest(BaseModel):
    """Request model for report evaluation."""

    content: str = Field(description="Report markdown content to evaluate")
    query: str = Field(description="Original research query")
    report_style: Optional[str] = Field(
        default="default", description="Report style (academic, news, etc.)"
    )
    # LLM evaluation is opt-in because it is slow and costs tokens.
    use_llm: bool = Field(
        default=False,
        description="Whether to use LLM for deep evaluation (slower but more detailed)",
    )


class EvaluationMetrics(BaseModel):
    """Automated metrics result (mirrors ReportMetrics.to_dict())."""

    word_count: int
    citation_count: int
    unique_sources: int
    image_count: int
    section_count: int
    section_coverage_score: float
    sections_found: list[str]
    sections_missing: list[str]
    has_title: bool
    has_key_points: bool
    has_overview: bool
    has_citations_section: bool


class LLMEvaluationScores(BaseModel):
    """Per-criterion LLM evaluation scores (1-10; 0 means unavailable)."""

    factual_accuracy: int = 0
    completeness: int = 0
    coherence: int = 0
    relevance: int = 0
    citation_quality: int = 0
    writing_quality: int = 0


class LLMEvaluation(BaseModel):
    """LLM evaluation result (mirrors EvaluationResult.to_dict())."""

    scores: LLMEvaluationScores
    overall_score: float
    weighted_score: float
    strengths: list[str]
    weaknesses: list[str]
    suggestions: list[str]


class EvaluateReportResponse(BaseModel):
    """Response model for report evaluation.

    llm_evaluation and summary are present only when use_llm was requested.
    """

    metrics: EvaluationMetrics
    score: float
    grade: str
    llm_evaluation: Optional[LLMEvaluation] = None
    summary: Optional[str] = None
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""Unit tests for the combined report evaluator."""

import json
from unittest.mock import AsyncMock, MagicMock

import pytest

from src.eval.evaluator import CombinedEvaluation, ReportEvaluator, score_to_grade
from src.eval.llm_judge import (
    EVALUATION_CRITERIA,
    MAX_REPORT_LENGTH,
    EvaluationResult,
    LLMJudge,
)
from src.eval.metrics import ReportMetrics


class TestScoreToGrade:
    """score_to_grade maps numeric scores onto letter grades."""

    def test_excellent_scores(self):
        for value, expected in [
            (9.5, "A+"), (9.0, "A+"), (8.7, "A"), (8.5, "A"), (8.2, "A-"),
        ]:
            assert score_to_grade(value) == expected

    def test_good_scores(self):
        for value, expected in [
            (7.8, "B+"), (7.5, "B+"), (7.2, "B"), (7.0, "B"), (6.7, "B-"),
        ]:
            assert score_to_grade(value) == expected

    def test_average_scores(self):
        for value, expected in [
            (6.2, "C+"), (5.8, "C"), (5.5, "C"), (5.2, "C-"),
        ]:
            assert score_to_grade(value) == expected

    def test_poor_scores(self):
        for value, expected in [
            (4.5, "D"), (4.0, "D"), (3.0, "F"), (1.0, "F"),
        ]:
            assert score_to_grade(value) == expected


class TestReportEvaluator:
    """Metrics-only behaviour of ReportEvaluator."""

    @pytest.fixture
    def evaluator(self):
        """Create evaluator without LLM for metrics-only tests."""
        return ReportEvaluator(use_llm=False)

    @pytest.fixture
    def sample_report(self):
        """A well-formed report exercising every metric dimension."""
        return """
# Comprehensive Research Report

## Key Points
- Important finding number one with significant implications
- Critical discovery that changes our understanding
- Key insight that provides actionable recommendations
- Notable observation from the research data

## Overview
This report presents a comprehensive analysis of the research topic.
The findings are based on extensive data collection and analysis.

## Detailed Analysis

### Section 1: Background
The background of this research involves multiple factors.
[Source 1](https://example.com/source1) provides foundational context.

### Section 2: Methodology
Our methodology follows established research practices.
[Source 2](https://research.org/methods) outlines the approach.

### Section 3: Findings
The key findings include several important discoveries.
![Research Data](https://example.com/chart.png)

[Source 3](https://academic.edu/paper) supports these conclusions.

## Key Citations
- [Example Source](https://example.com/source1)
- [Research Methods](https://research.org/methods)
- [Academic Paper](https://academic.edu/paper)
- [Additional Reference](https://reference.com/doc)
    """

    def test_evaluate_metrics_only(self, evaluator, sample_report):
        """A good report yields a positive score and a valid grade."""
        outcome = evaluator.evaluate_metrics_only(sample_report)

        assert "metrics" in outcome
        assert "score" in outcome
        assert "grade" in outcome
        assert outcome["score"] > 0
        assert outcome["grade"] in ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D", "F"]

    def test_evaluate_metrics_only_structure(self, evaluator, sample_report):
        """The metrics dict exposes the expected fields."""
        outcome = evaluator.evaluate_metrics_only(sample_report)
        metrics = outcome["metrics"]

        for key in (
            "word_count",
            "citation_count",
            "unique_sources",
            "image_count",
            "section_coverage_score",
        ):
            assert key in metrics

    def test_evaluate_minimal_report(self, evaluator):
        """A bare snippet of text scores poorly."""
        outcome = evaluator.evaluate_metrics_only("Just some text.")

        assert outcome["score"] < 5.0
        assert outcome["grade"] in ["D", "F"]

    def test_metrics_score_calculation(self, evaluator):
        """A structurally complete report clears the mid-score bar."""
        good_report = """
# Title

## Key Points
- Point 1
- Point 2

## Overview
Overview content here.

## Detailed Analysis
Analysis with [cite](https://a.com) and [cite2](https://b.com)
and [cite3](https://c.com) and more [refs](https://d.com).

![Image](https://img.com/1.png)

## Key Citations
- [A](https://a.com)
- [B](https://b.com)
    """
        outcome = evaluator.evaluate_metrics_only(good_report)
        assert outcome["score"] > 5.0

    def test_combined_evaluation_to_dict(self):
        """CombinedEvaluation serializes itself and its nested metrics."""
        metrics = ReportMetrics(
            word_count=1000,
            citation_count=5,
            unique_sources=3,
        )
        evaluation = CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=None,
            final_score=7.5,
            grade="B+",
            summary="Test summary",
        )

        payload = evaluation.to_dict()
        assert payload["final_score"] == 7.5
        assert payload["grade"] == "B+"
        assert payload["metrics"]["word_count"] == 1000


class TestReportEvaluatorIntegration:
    """Integration tests for evaluator (may require LLM)."""

    @pytest.mark.asyncio
    async def test_full_evaluation_without_llm(self):
        """Full async evaluation succeeds with the LLM disabled."""
        evaluator = ReportEvaluator(use_llm=False)

        report = """
# Test Report

## Key Points
- Key point 1

## Overview
Test overview.

## Key Citations
- [Test](https://test.com)
    """

        outcome = await evaluator.evaluate(report, "test query")

        assert isinstance(outcome, CombinedEvaluation)
        assert outcome.final_score > 0
        assert outcome.grade is not None
        assert outcome.summary is not None
        assert outcome.llm_evaluation is None


class TestLLMJudgeParseResponse:
    """Tests for LLMJudge._parse_response."""

    @pytest.fixture
    def judge(self):
        """Create LLMJudge with mock LLM."""
        return LLMJudge(llm=MagicMock())

    @pytest.fixture
    def valid_response_data(self):
        """A fully-populated evaluation payload."""
        return {
            "scores": {
                "factual_accuracy": 8,
                "completeness": 7,
                "coherence": 9,
                "relevance": 8,
                "citation_quality": 6,
                "writing_quality": 8,
            },
            "overall_score": 8,
            "strengths": ["Well researched", "Clear structure"],
            "weaknesses": ["Could use more citations"],
            "suggestions": ["Add more sources"],
        }

    def test_parse_valid_json(self, judge, valid_response_data):
        """Bare JSON is decoded as-is."""
        parsed = judge._parse_response(json.dumps(valid_response_data))

        assert parsed["scores"]["factual_accuracy"] == 8
        assert parsed["overall_score"] == 8
        assert "Well researched" in parsed["strengths"]

    def test_parse_json_in_markdown_block(self, judge, valid_response_data):
        """JSON wrapped in a ```json fence is unwrapped."""
        raw = f"```json\n{json.dumps(valid_response_data)}\n```"
        parsed = judge._parse_response(raw)

        assert parsed["scores"]["coherence"] == 9
        assert parsed["overall_score"] == 8

    def test_parse_json_in_generic_code_block(self, judge, valid_response_data):
        """JSON wrapped in a plain ``` fence is unwrapped."""
        raw = f"```\n{json.dumps(valid_response_data)}\n```"
        parsed = judge._parse_response(raw)

        assert parsed["scores"]["relevance"] == 8

    def test_parse_malformed_json_returns_defaults(self, judge):
        """Unparseable text falls back to neutral all-5 scores."""
        parsed = judge._parse_response("This is not valid JSON at all")

        assert parsed["scores"]["factual_accuracy"] == 5
        assert parsed["scores"]["completeness"] == 5
        assert parsed["overall_score"] == 5
        assert "Unable to parse evaluation" in parsed["strengths"]
        assert "Evaluation parsing failed" in parsed["weaknesses"]

    def test_parse_incomplete_json(self, judge):
        """Truncated JSON also falls back to the defaults."""
        parsed = judge._parse_response('{"scores": {"factual_accuracy": 8}')  # missing closing braces

        assert parsed["overall_score"] == 5

    def test_parse_json_with_extra_text(self, judge, valid_response_data):
        """Surrounding chatter outside the fence is ignored."""
        raw = f"Here is my evaluation:\n```json\n{json.dumps(valid_response_data)}\n```\nHope this helps!"
        parsed = judge._parse_response(raw)

        assert parsed["scores"]["factual_accuracy"] == 8


class TestLLMJudgeCalculateWeightedScore:
    """Tests for LLMJudge._calculate_weighted_score."""

    @pytest.fixture
    def judge(self):
        """Create LLMJudge with mock LLM."""
        return LLMJudge(llm=MagicMock())

    def test_calculate_with_all_scores(self, judge):
        """Uniform 10s give a weighted score of exactly 10."""
        uniform = {name: 10 for name in (
            "factual_accuracy",
            "completeness",
            "coherence",
            "relevance",
            "citation_quality",
            "writing_quality",
        )}
        assert judge._calculate_weighted_score(uniform) == 10.0

    def test_calculate_with_varied_scores(self, judge):
        """Mixed scores combine per the published weights (total 7.25)."""
        varied = {
            "factual_accuracy": 8,   # 8 * 0.25 = 2.0
            "completeness": 6,       # 6 * 0.20 = 1.2
            "coherence": 7,          # 7 * 0.20 = 1.4
            "relevance": 9,          # 9 * 0.15 = 1.35
            "citation_quality": 5,   # 5 * 0.10 = 0.5
            "writing_quality": 8,    # 8 * 0.10 = 0.8
        }
        assert judge._calculate_weighted_score(varied) == 7.25

    def test_calculate_with_partial_scores(self, judge):
        """Missing criteria renormalize: 3.2 / 0.45 ≈ 7.11."""
        partial = {
            "factual_accuracy": 8,  # weight 0.25
            "completeness": 6,      # weight 0.20
        }
        assert judge._calculate_weighted_score(partial) == pytest.approx(7.11, abs=0.01)

    def test_calculate_with_unknown_criteria(self, judge):
        """Keys outside the rubric are ignored entirely."""
        mixed = {
            "factual_accuracy": 10,
            "unknown_criterion": 1,  # Should be ignored
        }
        assert judge._calculate_weighted_score(mixed) == 10.0

    def test_calculate_with_empty_scores(self, judge):
        """No scores at all yields 0.0."""
        assert judge._calculate_weighted_score({}) == 0.0

    def test_weights_sum_to_one(self):
        """The rubric's weights must sum to 1.0."""
        total_weight = sum(c["weight"] for c in EVALUATION_CRITERIA.values())
        assert abs(total_weight - 1.0) < 0.001
isinstance(result, EvaluationResult) + assert result.scores["factual_accuracy"] == 8 + assert result.overall_score == 8 + assert result.weighted_score > 0 + assert "Comprehensive coverage" in result.strengths + assert result.raw_response == valid_llm_response + + @pytest.mark.asyncio + async def test_evaluation_with_llm_failure(self): + """Test that LLM failures are handled gracefully.""" + mock_llm = AsyncMock() + mock_llm.ainvoke.side_effect = Exception("LLM service unavailable") + + judge = LLMJudge(llm=mock_llm) + result = await judge.evaluate("Test report", "Test query") + + assert isinstance(result, EvaluationResult) + assert result.overall_score == 0 + assert result.weighted_score == 0 + assert all(score == 0 for score in result.scores.values()) + assert any("failed" in w.lower() for w in result.weaknesses) + + @pytest.mark.asyncio + async def test_evaluation_with_malformed_response(self): + """Test handling of malformed LLM response.""" + mock_llm = AsyncMock() + mock_response = MagicMock() + mock_response.content = "I cannot evaluate this report properly." 
+ mock_llm.ainvoke.return_value = mock_response + + judge = LLMJudge(llm=mock_llm) + result = await judge.evaluate("Test report", "Test query") + + # Should return default scores + assert result.scores["factual_accuracy"] == 5 + assert result.overall_score == 5 + + @pytest.mark.asyncio + async def test_evaluation_passes_report_style(self): + """Test that report_style is passed to LLM.""" + mock_llm = AsyncMock() + mock_response = MagicMock() + mock_response.content = json.dumps( + { + "scores": {k: 7 for k in EVALUATION_CRITERIA.keys()}, + "overall_score": 7, + "strengths": [], + "weaknesses": [], + "suggestions": [], + } + ) + mock_llm.ainvoke.return_value = mock_response + + judge = LLMJudge(llm=mock_llm) + await judge.evaluate("Test report", "Test query", report_style="academic") + + # Verify the prompt contains the report style + call_args = mock_llm.ainvoke.call_args + messages = call_args[0][0] + user_message_content = messages[1].content + assert "academic" in user_message_content + + @pytest.mark.asyncio + async def test_evaluation_truncates_long_reports(self): + """Test that very long reports are truncated.""" + mock_llm = AsyncMock() + mock_response = MagicMock() + mock_response.content = json.dumps( + { + "scores": {k: 7 for k in EVALUATION_CRITERIA.keys()}, + "overall_score": 7, + "strengths": [], + "weaknesses": [], + "suggestions": [], + } + ) + mock_llm.ainvoke.return_value = mock_response + + judge = LLMJudge(llm=mock_llm) + long_report = "x" * (MAX_REPORT_LENGTH + 5000) + await judge.evaluate(long_report, "Test query") + + call_args = mock_llm.ainvoke.call_args + messages = call_args[0][0] + user_message_content = messages[1].content + # The report content in the message should be truncated to MAX_REPORT_LENGTH + assert len(user_message_content) < len(long_report) + 500 + + +class TestEvaluationResult: + """Tests for EvaluationResult dataclass.""" + + def test_to_dict(self): + """Test EvaluationResult.to_dict method.""" + result = EvaluationResult( 
+ scores={"factual_accuracy": 8, "completeness": 7}, + overall_score=7.5, + weighted_score=7.6, + strengths=["Good research"], + weaknesses=["Needs more detail"], + suggestions=["Expand section 2"], + raw_response="test response", + ) + + d = result.to_dict() + assert d["scores"]["factual_accuracy"] == 8 + assert d["overall_score"] == 7.5 + assert d["weighted_score"] == 7.6 + assert "Good research" in d["strengths"] + # raw_response should not be in dict + assert "raw_response" not in d diff --git a/tests/unit/eval/test_metrics.py b/tests/unit/eval/test_metrics.py new file mode 100644 index 0000000..fbc38e1 --- /dev/null +++ b/tests/unit/eval/test_metrics.py @@ -0,0 +1,207 @@ +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# SPDX-License-Identifier: MIT + +"""Unit tests for report evaluation metrics.""" + +from src.eval.metrics import ( + compute_metrics, + count_citations, + count_images, + count_words, + detect_sections, + extract_domains, + get_word_count_target, +) + + +class TestCountWords: + """Tests for word counting function.""" + + def test_english_words(self): + text = "This is a simple test sentence." + assert count_words(text) == 6 + + def test_chinese_characters(self): + text = "这是一个测试" + assert count_words(text) == 6 + + def test_mixed_content(self): + text = "Hello 你好 World 世界" + assert count_words(text) == 4 + 2 # 2 English + 4 Chinese + + def test_empty_string(self): + assert count_words("") == 0 + + +class TestCountCitations: + """Tests for citation counting function.""" + + def test_markdown_citations(self): + text = """ + Check out [Google](https://google.com) and [GitHub](https://github.com). + """ + assert count_citations(text) == 2 + + def test_no_citations(self): + text = "This is plain text without any links." 
+ assert count_citations(text) == 0 + + def test_invalid_urls(self): + text = "[Link](not-a-url) [Another](ftp://ftp.example.com)" + assert count_citations(text) == 0 + + def test_complex_urls(self): + text = "[Article](https://example.com/path/to/article?id=123&ref=test)" + assert count_citations(text) == 1 + + +class TestExtractDomains: + """Tests for domain extraction function.""" + + def test_extract_multiple_domains(self): + text = """ + https://google.com/search + https://www.github.com/user/repo + https://docs.python.org/3/ + """ + domains = extract_domains(text) + assert len(domains) == 3 + assert "google.com" in domains + assert "github.com" in domains + assert "docs.python.org" in domains + + def test_deduplicate_domains(self): + text = """ + https://example.com/page1 + https://example.com/page2 + https://www.example.com/page3 + """ + domains = extract_domains(text) + assert len(domains) == 1 + assert "example.com" in domains + + def test_no_urls(self): + text = "Plain text without URLs" + assert extract_domains(text) == [] + + +class TestCountImages: + """Tests for image counting function.""" + + def test_markdown_images(self): + text = """ + ![Alt text](https://example.com/image1.png) + ![](https://example.com/image2.jpg) + """ + assert count_images(text) == 2 + + def test_no_images(self): + text = "Text without images [link](url)" + assert count_images(text) == 0 + + +class TestDetectSections: + """Tests for section detection function.""" + + def test_detect_title(self): + text = "# My Report Title\n\nSome content here." 
+ sections = detect_sections(text) + assert sections.get("title") is True + + def test_detect_key_points(self): + text = "## Key Points\n- Point 1\n- Point 2" + sections = detect_sections(text) + assert sections.get("key_points") is True + + def test_detect_chinese_sections(self): + text = """# 报告标题 +## 要点 +- 要点1 +## 概述 +这是概述内容 + """ + sections = detect_sections(text) + assert sections.get("title") is True + assert sections.get("key_points") is True + assert sections.get("overview") is True + + def test_detect_citations_section(self): + text = """ + ## Key Citations + - [Source 1](https://example.com) + """ + sections = detect_sections(text) + assert sections.get("key_citations") is True + + +class TestComputeMetrics: + """Tests for the main compute_metrics function.""" + + def test_complete_report(self): + report = """ +# Research Report Title + +## Key Points +- Point 1 +- Point 2 +- Point 3 + +## Overview +This is an overview of the research topic. + +## Detailed Analysis +Here is the detailed analysis with [source](https://example.com). + +![Figure 1](https://example.com/image.png) + +## Key Citations +- [Source 1](https://example.com) +- [Source 2](https://another.com) + """ + metrics = compute_metrics(report) + + assert metrics.has_title is True + assert metrics.has_key_points is True + assert metrics.has_overview is True + assert metrics.has_citations_section is True + assert metrics.citation_count >= 2 + assert metrics.image_count == 1 + assert metrics.unique_sources >= 1 + assert metrics.section_coverage_score > 0.5 + + def test_minimal_report(self): + report = "Just some text without structure." 
+ metrics = compute_metrics(report) + + assert metrics.has_title is False + assert metrics.citation_count == 0 + assert metrics.section_coverage_score < 0.5 + + def test_metrics_to_dict(self): + report = "# Title\n\nSome content" + metrics = compute_metrics(report) + result = metrics.to_dict() + + assert isinstance(result, dict) + assert "word_count" in result + assert "citation_count" in result + assert "section_coverage_score" in result + + +class TestGetWordCountTarget: + """Tests for word count target function.""" + + def test_strategic_investment_target(self): + target = get_word_count_target("strategic_investment") + assert target["min"] == 10000 + assert target["max"] == 15000 + + def test_news_target(self): + target = get_word_count_target("news") + assert target["min"] == 800 + assert target["max"] == 2000 + + def test_default_target(self): + target = get_word_count_target("unknown_style") + assert target["min"] == 1000 + assert target["max"] == 5000 diff --git a/web/messages/en.json b/web/messages/en.json index 7cb1619..826fc1a 100644 --- a/web/messages/en.json +++ b/web/messages/en.json @@ -150,6 +150,7 @@ "downloadWord": "Word (.docx)", "downloadImage": "Image (.png)", "exportFailed": "Export failed, please try again", + "evaluateReport": "Evaluate report quality", "searchingFor": "Searching for", "reading": "Reading", "runningPythonCode": "Running Python code", @@ -163,6 +164,31 @@ "errorGeneratingPodcast": "Error when generating podcast. 
Please try again.", "downloadPodcast": "Download podcast" }, + "evaluation": { + "title": "Report Quality Evaluation", + "description": "Evaluate your report using automated metrics and AI analysis.", + "evaluating": "Evaluating report...", + "analyzing": "Running deep analysis...", + "overallScore": "Overall Score", + "metrics": "Report Metrics", + "wordCount": "Word Count", + "citations": "Citations", + "sources": "Unique Sources", + "images": "Images", + "sectionCoverage": "Section Coverage", + "detailedAnalysis": "Detailed Analysis", + "deepEvaluation": "Deep Evaluation (AI)", + "strengths": "Strengths", + "weaknesses": "Areas for Improvement", + "scores": { + "factual_accuracy": "Factual Accuracy", + "completeness": "Completeness", + "coherence": "Coherence", + "relevance": "Relevance", + "citation_quality": "Citation Quality", + "writing_quality": "Writing Quality" + } + }, "messages": { "replaying": "Replaying", "replayDescription": "DeerFlow is now replaying the conversation...", diff --git a/web/messages/zh.json b/web/messages/zh.json index d51dff3..e84d9f9 100644 --- a/web/messages/zh.json +++ b/web/messages/zh.json @@ -150,6 +150,7 @@ "downloadWord": "Word (.docx)", "downloadImage": "图片 (.png)", "exportFailed": "导出失败,请重试", + "evaluateReport": "评估报告质量", "searchingFor": "搜索", "reading": "阅读中", "runningPythonCode": "运行 Python 代码", @@ -163,6 +164,31 @@ "errorGeneratingPodcast": "生成播客时出错。请重试。", "downloadPodcast": "下载播客" }, + "evaluation": { + "title": "报告质量评估", + "description": "使用自动化指标和 AI 分析评估您的报告。", + "evaluating": "正在评估报告...", + "analyzing": "正在进行深度分析...", + "overallScore": "总体评分", + "metrics": "报告指标", + "wordCount": "字数", + "citations": "引用数", + "sources": "独立来源", + "images": "图片数", + "sectionCoverage": "章节覆盖率", + "detailedAnalysis": "详细分析", + "deepEvaluation": "深度评估 (AI)", + "strengths": "优势", + "weaknesses": "改进建议", + "scores": { + "factual_accuracy": "事实准确性", + "completeness": "完整性", + "coherence": "连贯性", + "relevance": "相关性", + "citation_quality": 
"引用质量", + "writing_quality": "写作质量" + } + }, "messages": { "replaying": "回放中", "replayDescription": "DeerFlow 正在回放对话...", diff --git a/web/src/app/chat/components/evaluation-dialog.tsx b/web/src/app/chat/components/evaluation-dialog.tsx new file mode 100644 index 0000000..6c3524c --- /dev/null +++ b/web/src/app/chat/components/evaluation-dialog.tsx @@ -0,0 +1,300 @@ +// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +// SPDX-License-Identifier: MIT + +import { + BookOpen, + FileText, + Image, + Link2, + Loader2, + Sparkles, + ThumbsDown, + ThumbsUp, +} from "lucide-react"; +import { useTranslations } from "next-intl"; +import { useCallback, useEffect, useRef, useState } from "react"; + +import { Button } from "~/components/ui/button"; +import { + Dialog, + DialogContent, + DialogDescription, + DialogHeader, + DialogTitle, +} from "~/components/ui/dialog"; +import { Progress } from "~/components/ui/progress"; +import { evaluateReport, type EvaluationResult } from "~/core/api"; +import { cn } from "~/lib/utils"; + +interface EvaluationDialogProps { + open: boolean; + onOpenChange: (open: boolean) => void; + reportContent: string; + query: string; + reportStyle?: string; +} + +function GradeBadge({ grade }: { grade: string }) { + const gradeColors: Record = { + "A+": "bg-emerald-500", + A: "bg-emerald-500", + "A-": "bg-emerald-400", + "B+": "bg-blue-500", + B: "bg-blue-500", + "B-": "bg-blue-400", + "C+": "bg-yellow-500", + C: "bg-yellow-500", + "C-": "bg-yellow-400", + D: "bg-orange-500", + F: "bg-red-500", + }; + + return ( +
+ {grade} +
+ ); +} + +function MetricItem({ + icon: Icon, + label, + value, + suffix, +}: { + icon: React.ComponentType<{ className?: string }>; + label: string; + value: number | string; + suffix?: string; +}) { + return ( +
+ + {label} + + {value} + {suffix} + +
+ ); +} + +export function EvaluationDialog({ + open, + onOpenChange, + reportContent, + query, + reportStyle, +}: EvaluationDialogProps) { + const t = useTranslations("chat.evaluation"); + const [loading, setLoading] = useState(false); + const [deepLoading, setDeepLoading] = useState(false); + const [result, setResult] = useState(null); + const [error, setError] = useState(null); + const hasRunInitialEvaluation = useRef(false); + + const runEvaluation = useCallback( + async (useLlm: boolean) => { + if (useLlm) { + setDeepLoading(true); + } else { + setLoading(true); + } + setError(null); + + try { + const evalResult = await evaluateReport( + reportContent, + query, + reportStyle, + useLlm, + ); + setResult(evalResult); + } catch (err) { + setError(err instanceof Error ? err.message : "Evaluation failed"); + } finally { + setLoading(false); + setDeepLoading(false); + } + }, + [reportContent, query, reportStyle], + ); + + useEffect(() => { + if (open && !hasRunInitialEvaluation.current) { + hasRunInitialEvaluation.current = true; + void runEvaluation(false); + } + }, [open, runEvaluation]); + + useEffect(() => { + if (!open) { + setResult(null); + setError(null); + hasRunInitialEvaluation.current = false; + } + }, [open]); + + return ( + + + + {t("title")} + {t("description")} + + + {loading && !result ? ( +
+ +

+ {t("evaluating")} +

+
+ ) : error ? ( +
{error}
+ ) : result ? ( +
+ {/* Grade and Score */} +
+ +
+
{result.score}/10
+
+ {t("overallScore")} +
+
+
+ + {/* Metrics */} +
+

{t("metrics")}

+
+ + + + +
+
+ + {t("sectionCoverage")} + + + {Math.round(result.metrics.section_coverage_score * 100)}% + +
+ +
+
+
+ + {/* LLM Evaluation Results */} + {result.llm_evaluation && ( +
+

{t("detailedAnalysis")}

+ + {/* LLM Scores */} +
+ {Object.entries(result.llm_evaluation.scores).map( + ([key, value]) => ( +
+ + {t(`scores.${key}`)} + + {value}/10 +
+ ), + )} +
+ + {/* Strengths */} + {result.llm_evaluation.strengths.length > 0 && ( +
+
+ + {t("strengths")} +
+
    + {result.llm_evaluation.strengths + .slice(0, 3) + .map((s, i) => ( +
  • + • {s} +
  • + ))} +
+
+ )} + + {/* Weaknesses */} + {result.llm_evaluation.weaknesses.length > 0 && ( +
+
+ + {t("weaknesses")} +
+
    + {result.llm_evaluation.weaknesses + .slice(0, 3) + .map((w, i) => ( +
  • + • {w} +
  • + ))} +
+
+ )} +
+ )} + + {/* Deep Evaluation Button */} + {!result.llm_evaluation && ( + + )} +
+ ) : null} +
+
+ ); +} diff --git a/web/src/app/chat/components/research-block.tsx b/web/src/app/chat/components/research-block.tsx index 2880351..f4d1e3f 100644 --- a/web/src/app/chat/components/research-block.tsx +++ b/web/src/app/chat/components/research-block.tsx @@ -16,6 +16,7 @@ import { jsPDF } from "jspdf"; import { Check, Copy, + GraduationCap, Headphones, Pencil, Undo2, @@ -43,9 +44,10 @@ import { } from "~/components/ui/dropdown-menu"; import { Tabs, TabsContent, TabsList, TabsTrigger } from "~/components/ui/tabs"; import { useReplay } from "~/core/replay"; -import { closeResearch, listenToPodcast, useStore } from "~/core/store"; +import { closeResearch, getResearchQuery, listenToPodcast, useStore, useSettingsStore } from "~/core/store"; import { cn } from "~/lib/utils"; +import { EvaluationDialog } from "./evaluation-dialog"; import { ResearchActivitiesBlock } from "./research-activities-block"; import { ResearchReportBlock } from "./research-report-block"; @@ -84,6 +86,7 @@ export function ResearchBlock({ const [editing, setEditing] = useState(false); const [isDownloading, setIsDownloading] = useState(false); const [copied, setCopied] = useState(false); + const [showEvaluation, setShowEvaluation] = useState(false); const handleCopy = useCallback(() => { if (!reportId) { return; @@ -676,6 +679,16 @@ ${htmlContent} {copied ? : } + + + @@ -796,6 +809,19 @@ ${htmlContent} + + {/* Evaluation Dialog */} + {reportId && researchId && ( + + )} ); } diff --git a/web/src/components/ui/progress.tsx b/web/src/components/ui/progress.tsx new file mode 100644 index 0000000..248e284 --- /dev/null +++ b/web/src/components/ui/progress.tsx @@ -0,0 +1,30 @@ +"use client" + +import * as React from "react" + +import { cn } from "~/lib/utils" + +interface ProgressProps extends React.HTMLAttributes { + value?: number +} + +function Progress({ className, value = 0, ...props }: ProgressProps) { + return ( +
+
+
+ ) +} + +export { Progress } diff --git a/web/src/core/api/evaluate.ts b/web/src/core/api/evaluate.ts new file mode 100644 index 0000000..9d46f8a --- /dev/null +++ b/web/src/core/api/evaluate.ts @@ -0,0 +1,91 @@ +// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +// SPDX-License-Identifier: MIT + +import { resolveServiceURL } from "./resolve-service-url"; + +/** + * Report evaluation API client. + */ + +export interface EvaluationMetrics { + word_count: number; + citation_count: number; + unique_sources: number; + image_count: number; + section_count: number; + section_coverage_score: number; + sections_found: string[]; + sections_missing: string[]; + has_title: boolean; + has_key_points: boolean; + has_overview: boolean; + has_citations_section: boolean; +} + +export interface LLMEvaluationScores { + factual_accuracy: number; + completeness: number; + coherence: number; + relevance: number; + citation_quality: number; + writing_quality: number; +} + +export interface LLMEvaluation { + scores: LLMEvaluationScores; + overall_score: number; + weighted_score: number; + strengths: string[]; + weaknesses: string[]; + suggestions: string[]; +} + +export interface EvaluationResult { + metrics: EvaluationMetrics; + score: number; + grade: string; + llm_evaluation?: LLMEvaluation; + summary?: string; +} + +export interface EvaluateReportRequest { + content: string; + query: string; + report_style?: string; + use_llm?: boolean; +} + +/** + * Evaluate a report's quality using automated metrics and optionally LLM-as-Judge. + * + * @param content - Report markdown content + * @param query - Original research query + * @param reportStyle - Report style (academic, news, etc.) 
+ * @param useLlm - Whether to use LLM for deep evaluation + * @returns Evaluation result with metrics, score, and grade + */ +export async function evaluateReport( + content: string, + query: string, + reportStyle?: string, + useLlm?: boolean, +): Promise { + const response = await fetch(resolveServiceURL("report/evaluate"), { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + content, + query, + report_style: reportStyle ?? "default", + use_llm: useLlm ?? false, + } satisfies EvaluateReportRequest), + }); + + if (!response.ok) { + throw new Error(`Evaluation failed: ${response.statusText}`); + } + + return response.json(); +} diff --git a/web/src/core/api/index.ts b/web/src/core/api/index.ts index e21a050..2ad2c65 100644 --- a/web/src/core/api/index.ts +++ b/web/src/core/api/index.ts @@ -2,6 +2,7 @@ // SPDX-License-Identifier: MIT export * from "./chat"; +export * from "./evaluate"; export * from "./mcp"; export * from "./podcast"; export * from "./prompt-enhancer"; diff --git a/web/src/core/store/store.ts b/web/src/core/store/store.ts index 79167fe..357b62d 100644 --- a/web/src/core/store/store.ts +++ b/web/src/core/store/store.ts @@ -24,6 +24,7 @@ export const useStore = create<{ researchPlanIds: Map; researchReportIds: Map; researchActivityIds: Map; + researchQueries: Map; ongoingResearchId: string | null; openResearchId: string | null; @@ -42,6 +43,7 @@ export const useStore = create<{ researchPlanIds: new Map(), researchReportIds: new Map(), researchActivityIds: new Map(), + researchQueries: new Map(), ongoingResearchId: null, openResearchId: null, @@ -267,11 +269,17 @@ function getOngoingResearchId() { function appendResearch(researchId: string) { let planMessage: Message | undefined; + let userQuery: string | undefined; const reversedMessageIds = [...useStore.getState().messageIds].reverse(); for (const messageId of reversedMessageIds) { const message = getMessage(messageId); - if (message?.agent === 
"planner") { + if (!planMessage && message?.agent === "planner") { planMessage = message; + } + if (!userQuery && message?.role === "user") { + userQuery = message.content; + } + if (planMessage && userQuery) { break; } } @@ -288,6 +296,10 @@ function appendResearch(researchId: string) { researchId, messageIds, ), + researchQueries: new Map(useStore.getState().researchQueries).set( + researchId, + userQuery ?? "", + ), }); } @@ -394,6 +406,10 @@ export function useResearchMessage(researchId: string) { ); } +export function getResearchQuery(researchId: string): string { + return useStore.getState().researchQueries.get(researchId) ?? ""; +} + export function useMessage(messageId: string | null | undefined) { return useStore( useShallow((state) =>