mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-19 04:14:46 +08:00
feat(eval): add report quality evaluation module and UI integration (#776)
* feat(eval): add report quality evaluation module Addresses issue #773 - How to evaluate generated report quality objectively. This module provides two evaluation approaches: 1. Automated metrics (no LLM required): - Citation count and source diversity - Word count compliance per report style - Section structure validation - Image inclusion tracking 2. LLM-as-Judge evaluation: - Factual accuracy scoring - Completeness assessment - Coherence evaluation - Relevance and citation quality checks The combined evaluator provides a final score (1-10) and letter grade (A+ to F). Files added: - src/eval/__init__.py - src/eval/metrics.py - src/eval/llm_judge.py - src/eval/evaluator.py - tests/unit/eval/test_metrics.py - tests/unit/eval/test_evaluator.py * feat(eval): integrate report evaluation with web UI This commit adds the web UI integration for the evaluation module: Backend: - Add EvaluateReportRequest/Response models in src/server/eval_request.py - Add /api/report/evaluate endpoint to src/server/app.py Frontend: - Add evaluateReport API function in web/src/core/api/evaluate.ts - Create EvaluationDialog component with grade badge, metrics display, and optional LLM deep evaluation - Add evaluation button (graduation cap icon) to research-block.tsx toolbar - Add i18n translations for English and Chinese The evaluation UI allows users to: 1. View quick metrics-only evaluation (instant) 2. Optionally run deep LLM-based evaluation for detailed analysis 3. 
See grade (A+ to F), score (1-10), and metric breakdown * feat(eval): improve evaluation reliability and add LLM judge tests - Extract MAX_REPORT_LENGTH constant in llm_judge.py for maintainability - Add comprehensive unit tests for LLMJudge class (parse_response, calculate_weighted_score, evaluate with mocked LLM) - Pass reportStyle prop to EvaluationDialog for accurate evaluation criteria - Add researchQueries store map to reliably associate queries with research - Add getResearchQuery helper to retrieve query by researchId - Remove unused imports in test_metrics.py * fix(eval): use resolveServiceURL for evaluate API endpoint The evaluateReport function was using a relative URL '/api/report/evaluate' which sent requests to the Next.js server instead of the FastAPI backend. Changed to use resolveServiceURL() consistent with other API functions. * fix: improve type accuracy and React hooks in evaluation components - Fix get_word_count_target return type from Optional[Dict] to Dict since it always returns a value via default fallback - Fix useEffect dependency issue in EvaluationDialog using useRef to prevent unwanted re-evaluations - Add aria-label to GradeBadge for screen reader accessibility
This commit is contained in:
@@ -150,6 +150,7 @@
     "downloadWord": "Word (.docx)",
     "downloadImage": "图片 (.png)",
     "exportFailed": "导出失败,请重试",
+    "evaluateReport": "评估报告质量",
     "searchingFor": "搜索",
     "reading": "阅读中",
     "runningPythonCode": "运行 Python 代码",
@@ -163,6 +164,31 @@
     "errorGeneratingPodcast": "生成播客时出错。请重试。",
     "downloadPodcast": "下载播客"
   },
+  "evaluation": {
+    "title": "报告质量评估",
+    "description": "使用自动化指标和 AI 分析评估您的报告。",
+    "evaluating": "正在评估报告...",
+    "analyzing": "正在进行深度分析...",
+    "overallScore": "总体评分",
+    "metrics": "报告指标",
+    "wordCount": "字数",
+    "citations": "引用数",
+    "sources": "独立来源",
+    "images": "图片数",
+    "sectionCoverage": "章节覆盖率",
+    "detailedAnalysis": "详细分析",
+    "deepEvaluation": "深度评估 (AI)",
+    "strengths": "优势",
+    "weaknesses": "改进建议",
+    "scores": {
+      "factual_accuracy": "事实准确性",
+      "completeness": "完整性",
+      "coherence": "连贯性",
+      "relevance": "相关性",
+      "citation_quality": "引用质量",
+      "writing_quality": "写作质量"
+    }
+  },
   "messages": {
     "replaying": "回放中",
     "replayDescription": "DeerFlow 正在回放对话...",
Reference in New Issue
Block a user