mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-13 10:24:44 +08:00
feat(eval): add report quality evaluation module and UI integration (#776)
* feat(eval): add report quality evaluation module Addresses issue #773 - How to evaluate generated report quality objectively. This module provides two evaluation approaches: 1. Automated metrics (no LLM required): - Citation count and source diversity - Word count compliance per report style - Section structure validation - Image inclusion tracking 2. LLM-as-Judge evaluation: - Factual accuracy scoring - Completeness assessment - Coherence evaluation - Relevance and citation quality checks The combined evaluator provides a final score (1-10) and letter grade (A+ to F). Files added: - src/eval/__init__.py - src/eval/metrics.py - src/eval/llm_judge.py - src/eval/evaluator.py - tests/unit/eval/test_metrics.py - tests/unit/eval/test_evaluator.py * feat(eval): integrate report evaluation with web UI This commit adds the web UI integration for the evaluation module: Backend: - Add EvaluateReportRequest/Response models in src/server/eval_request.py - Add /api/report/evaluate endpoint to src/server/app.py Frontend: - Add evaluateReport API function in web/src/core/api/evaluate.ts - Create EvaluationDialog component with grade badge, metrics display, and optional LLM deep evaluation - Add evaluation button (graduation cap icon) to research-block.tsx toolbar - Add i18n translations for English and Chinese The evaluation UI allows users to: 1. View quick metrics-only evaluation (instant) 2. Optionally run deep LLM-based evaluation for detailed analysis 3. 
See grade (A+ to F), score (1-10), and metric breakdown * feat(eval): improve evaluation reliability and add LLM judge tests - Extract MAX_REPORT_LENGTH constant in llm_judge.py for maintainability - Add comprehensive unit tests for LLMJudge class (parse_response, calculate_weighted_score, evaluate with mocked LLM) - Pass reportStyle prop to EvaluationDialog for accurate evaluation criteria - Add researchQueries store map to reliably associate queries with research - Add getResearchQuery helper to retrieve query by researchId - Remove unused imports in test_metrics.py * fix(eval): use resolveServiceURL for evaluate API endpoint The evaluateReport function was using a relative URL '/api/report/evaluate' which sent requests to the Next.js server instead of the FastAPI backend. Changed to use resolveServiceURL() consistent with other API functions. * fix: improve type accuracy and React hooks in evaluation components - Fix get_word_count_target return type from Optional[Dict] to Dict since it always returns a value via default fallback - Fix useEffect dependency issue in EvaluationDialog using useRef to prevent unwanted re-evaluations - Add aria-label to GradeBadge for screen reader accessibility
This commit is contained in:
300
web/src/app/chat/components/evaluation-dialog.tsx
Normal file
300
web/src/app/chat/components/evaluation-dialog.tsx
Normal file
@@ -0,0 +1,300 @@
|
||||
// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
import {
|
||||
BookOpen,
|
||||
FileText,
|
||||
Image,
|
||||
Link2,
|
||||
Loader2,
|
||||
Sparkles,
|
||||
ThumbsDown,
|
||||
ThumbsUp,
|
||||
} from "lucide-react";
|
||||
import { useTranslations } from "next-intl";
|
||||
import { useCallback, useEffect, useRef, useState } from "react";
|
||||
|
||||
import { Button } from "~/components/ui/button";
|
||||
import {
|
||||
Dialog,
|
||||
DialogContent,
|
||||
DialogDescription,
|
||||
DialogHeader,
|
||||
DialogTitle,
|
||||
} from "~/components/ui/dialog";
|
||||
import { Progress } from "~/components/ui/progress";
|
||||
import { evaluateReport, type EvaluationResult } from "~/core/api";
|
||||
import { cn } from "~/lib/utils";
|
||||
|
||||
/** Props for {@link EvaluationDialog}. */
interface EvaluationDialogProps {
  /** Whether the dialog is currently shown. */
  open: boolean;
  /** Called with the new open state when the dialog is opened/closed. */
  onOpenChange: (open: boolean) => void;
  /** Markdown content of the report to evaluate. */
  reportContent: string;
  /** Original research query the report was generated for. */
  query: string;
  /** Report style identifier (e.g. "academic") passed through to the evaluate API. */
  reportStyle?: string;
}
|
||||
|
||||
function GradeBadge({ grade }: { grade: string }) {
|
||||
const gradeColors: Record<string, string> = {
|
||||
"A+": "bg-emerald-500",
|
||||
A: "bg-emerald-500",
|
||||
"A-": "bg-emerald-400",
|
||||
"B+": "bg-blue-500",
|
||||
B: "bg-blue-500",
|
||||
"B-": "bg-blue-400",
|
||||
"C+": "bg-yellow-500",
|
||||
C: "bg-yellow-500",
|
||||
"C-": "bg-yellow-400",
|
||||
D: "bg-orange-500",
|
||||
F: "bg-red-500",
|
||||
};
|
||||
|
||||
return (
|
||||
<div
|
||||
aria-label={`Report grade: ${grade}`}
|
||||
className={cn(
|
||||
"flex h-16 w-16 items-center justify-center rounded-full text-2xl font-bold text-white",
|
||||
gradeColors[grade] ?? "bg-gray-500",
|
||||
)}
|
||||
>
|
||||
{grade}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * One row in the metrics panel: an icon, a label, and a right-aligned value
 * with an optional suffix appended directly after it.
 */
function MetricItem(props: {
  icon: React.ComponentType<{ className?: string }>;
  label: string;
  value: number | string;
  suffix?: string;
}) {
  const { icon: Icon, label, value, suffix } = props;

  return (
    <div className="flex items-center gap-3">
      <Icon className="text-muted-foreground h-4 w-4" />
      <span className="text-muted-foreground text-sm">{label}</span>
      <span className="ml-auto font-medium">
        {value}
        {suffix}
      </span>
    </div>
  );
}
|
||||
|
||||
/**
 * Dialog that presents a quality evaluation of a generated research report.
 *
 * When first opened it automatically runs the fast, metrics-only evaluation,
 * then offers a button to optionally run the slower LLM-based "deep"
 * evaluation. All evaluation state is cleared when the dialog closes, so a
 * reopened dialog re-evaluates from scratch.
 */
export function EvaluationDialog({
  open,
  onOpenChange,
  reportContent,
  query,
  reportStyle,
}: EvaluationDialogProps) {
  const t = useTranslations("chat.evaluation");
  // `loading` covers the initial metrics-only run; `deepLoading` covers the
  // LLM-based deep evaluation triggered by the button in the footer.
  const [loading, setLoading] = useState(false);
  const [deepLoading, setDeepLoading] = useState(false);
  const [result, setResult] = useState<EvaluationResult | null>(null);
  const [error, setError] = useState<string | null>(null);
  // A ref (not state) so that re-renders and changing effect deps cannot
  // re-trigger the automatic first evaluation while the dialog stays open.
  const hasRunInitialEvaluation = useRef(false);

  // Calls the evaluate API; `useLlm` selects quick metrics vs. deep LLM mode.
  // Both loading flags are cleared in `finally` regardless of which was set.
  const runEvaluation = useCallback(
    async (useLlm: boolean) => {
      if (useLlm) {
        setDeepLoading(true);
      } else {
        setLoading(true);
      }
      setError(null);

      try {
        const evalResult = await evaluateReport(
          reportContent,
          query,
          reportStyle,
          useLlm,
        );
        setResult(evalResult);
      } catch (err) {
        setError(err instanceof Error ? err.message : "Evaluation failed");
      } finally {
        setLoading(false);
        setDeepLoading(false);
      }
    },
    [reportContent, query, reportStyle],
  );

  // Kick off the quick evaluation exactly once per dialog opening.
  useEffect(() => {
    if (open && !hasRunInitialEvaluation.current) {
      hasRunInitialEvaluation.current = true;
      void runEvaluation(false);
    }
  }, [open, runEvaluation]);

  // Reset results and re-arm the initial evaluation when the dialog closes.
  useEffect(() => {
    if (!open) {
      setResult(null);
      setError(null);
      hasRunInitialEvaluation.current = false;
    }
  }, [open]);

  return (
    <Dialog open={open} onOpenChange={onOpenChange}>
      <DialogContent className="sm:max-w-md">
        <DialogHeader>
          <DialogTitle>{t("title")}</DialogTitle>
          <DialogDescription>{t("description")}</DialogDescription>
        </DialogHeader>

        {loading && !result ? (
          <div className="flex flex-col items-center justify-center py-8">
            <Loader2 className="h-8 w-8 animate-spin text-blue-500" />
            <p className="text-muted-foreground mt-4 text-sm">
              {t("evaluating")}
            </p>
          </div>
        ) : error ? (
          <div className="py-4 text-center text-red-500">{error}</div>
        ) : result ? (
          <div className="space-y-6">
            {/* Grade and Score */}
            <div className="flex items-center gap-6">
              <GradeBadge grade={result.grade} />
              <div>
                <div className="text-3xl font-bold">{result.score}/10</div>
                <div className="text-muted-foreground text-sm">
                  {t("overallScore")}
                </div>
              </div>
            </div>

            {/* Metrics */}
            <div className="space-y-3">
              <h4 className="text-sm font-medium">{t("metrics")}</h4>
              <div className="bg-muted/50 space-y-2 rounded-lg p-3">
                <MetricItem
                  icon={FileText}
                  label={t("wordCount")}
                  value={result.metrics.word_count.toLocaleString()}
                />
                <MetricItem
                  icon={Link2}
                  label={t("citations")}
                  value={result.metrics.citation_count}
                />
                <MetricItem
                  icon={BookOpen}
                  label={t("sources")}
                  value={result.metrics.unique_sources}
                />
                <MetricItem
                  icon={Image}
                  label={t("images")}
                  value={result.metrics.image_count}
                />
                <div className="pt-2">
                  <div className="mb-1 flex items-center justify-between text-sm">
                    <span className="text-muted-foreground">
                      {t("sectionCoverage")}
                    </span>
                    <span className="font-medium">
                      {Math.round(result.metrics.section_coverage_score * 100)}%
                    </span>
                  </div>
                  <Progress
                    value={result.metrics.section_coverage_score * 100}
                    className="h-2"
                  />
                </div>
              </div>
            </div>

            {/* LLM Evaluation Results (only present after a deep evaluation) */}
            {result.llm_evaluation && (
              <div className="space-y-3">
                <h4 className="text-sm font-medium">{t("detailedAnalysis")}</h4>

                {/* LLM Scores */}
                <div className="bg-muted/50 grid grid-cols-2 gap-2 rounded-lg p-3 text-sm">
                  {Object.entries(result.llm_evaluation.scores).map(
                    ([key, value]) => (
                      <div key={key} className="flex justify-between">
                        <span className="text-muted-foreground">
                          {t(`scores.${key}`)}
                        </span>
                        <span className="font-medium">{value}/10</span>
                      </div>
                    ),
                  )}
                </div>

                {/* Strengths (top 3) */}
                {result.llm_evaluation.strengths.length > 0 && (
                  <div className="space-y-2">
                    <div className="flex items-center gap-2 text-sm font-medium text-emerald-600">
                      <ThumbsUp className="h-4 w-4" />
                      {t("strengths")}
                    </div>
                    <ul className="space-y-1 text-sm">
                      {result.llm_evaluation.strengths
                        .slice(0, 3)
                        .map((s, i) => (
                          <li key={i} className="text-muted-foreground">
                            • {s}
                          </li>
                        ))}
                    </ul>
                  </div>
                )}

                {/* Weaknesses (top 3) */}
                {result.llm_evaluation.weaknesses.length > 0 && (
                  <div className="space-y-2">
                    <div className="flex items-center gap-2 text-sm font-medium text-orange-600">
                      <ThumbsDown className="h-4 w-4" />
                      {t("weaknesses")}
                    </div>
                    <ul className="space-y-1 text-sm">
                      {result.llm_evaluation.weaknesses
                        .slice(0, 3)
                        .map((w, i) => (
                          <li key={i} className="text-muted-foreground">
                            • {w}
                          </li>
                        ))}
                    </ul>
                  </div>
                )}
              </div>
            )}

            {/* Deep Evaluation Button (hidden once the LLM results exist) */}
            {!result.llm_evaluation && (
              <Button
                variant="outline"
                className="w-full"
                onClick={() => runEvaluation(true)}
                disabled={deepLoading}
              >
                {deepLoading ? (
                  <>
                    <Loader2 className="mr-2 h-4 w-4 animate-spin" />
                    {t("analyzing")}
                  </>
                ) : (
                  <>
                    <Sparkles className="mr-2 h-4 w-4" />
                    {t("deepEvaluation")}
                  </>
                )}
              </Button>
            )}
          </div>
        ) : null}
      </DialogContent>
    </Dialog>
  );
}
|
||||
@@ -16,6 +16,7 @@ import { jsPDF } from "jspdf";
|
||||
import {
|
||||
Check,
|
||||
Copy,
|
||||
GraduationCap,
|
||||
Headphones,
|
||||
Pencil,
|
||||
Undo2,
|
||||
@@ -43,9 +44,10 @@ import {
|
||||
} from "~/components/ui/dropdown-menu";
|
||||
import { Tabs, TabsContent, TabsList, TabsTrigger } from "~/components/ui/tabs";
|
||||
import { useReplay } from "~/core/replay";
|
||||
import { closeResearch, listenToPodcast, useStore } from "~/core/store";
|
||||
import { closeResearch, getResearchQuery, listenToPodcast, useStore, useSettingsStore } from "~/core/store";
|
||||
import { cn } from "~/lib/utils";
|
||||
|
||||
import { EvaluationDialog } from "./evaluation-dialog";
|
||||
import { ResearchActivitiesBlock } from "./research-activities-block";
|
||||
import { ResearchReportBlock } from "./research-report-block";
|
||||
|
||||
@@ -84,6 +86,7 @@ export function ResearchBlock({
|
||||
const [editing, setEditing] = useState(false);
|
||||
const [isDownloading, setIsDownloading] = useState(false);
|
||||
const [copied, setCopied] = useState(false);
|
||||
const [showEvaluation, setShowEvaluation] = useState(false);
|
||||
const handleCopy = useCallback(() => {
|
||||
if (!reportId) {
|
||||
return;
|
||||
@@ -676,6 +679,16 @@ ${htmlContent}
|
||||
{copied ? <Check /> : <Copy />}
|
||||
</Button>
|
||||
</Tooltip>
|
||||
<Tooltip title={t("evaluateReport")}>
|
||||
<Button
|
||||
className="text-gray-400"
|
||||
size="icon"
|
||||
variant="ghost"
|
||||
onClick={() => setShowEvaluation(true)}
|
||||
>
|
||||
<GraduationCap />
|
||||
</Button>
|
||||
</Tooltip>
|
||||
<DropdownMenu>
|
||||
<Tooltip title={t("downloadReport")}>
|
||||
<DropdownMenuTrigger asChild>
|
||||
@@ -796,6 +809,19 @@ ${htmlContent}
|
||||
</TabsContent>
|
||||
</Tabs>
|
||||
</Card>
|
||||
|
||||
{/* Evaluation Dialog */}
|
||||
{reportId && researchId && (
|
||||
<EvaluationDialog
|
||||
open={showEvaluation}
|
||||
onOpenChange={setShowEvaluation}
|
||||
reportContent={
|
||||
useStore.getState().messages.get(reportId)?.content ?? ""
|
||||
}
|
||||
query={getResearchQuery(researchId)}
|
||||
reportStyle={useSettingsStore.getState().general.reportStyle.toLowerCase()}
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
30
web/src/components/ui/progress.tsx
Normal file
30
web/src/components/ui/progress.tsx
Normal file
@@ -0,0 +1,30 @@
|
||||
"use client"
|
||||
|
||||
import * as React from "react"
|
||||
|
||||
import { cn } from "~/lib/utils"
|
||||
|
||||
/** Props for the Progress bar; all standard div attributes are passed through. */
interface ProgressProps extends React.HTMLAttributes<HTMLDivElement> {
  /** Completion percentage in [0, 100]; defaults to 0. Out-of-range values are clamped. */
  value?: number
}
|
||||
|
||||
function Progress({ className, value = 0, ...props }: ProgressProps) {
|
||||
return (
|
||||
<div
|
||||
data-slot="progress"
|
||||
className={cn(
|
||||
"bg-primary/20 relative h-2 w-full overflow-hidden rounded-full",
|
||||
className
|
||||
)}
|
||||
{...props}
|
||||
>
|
||||
<div
|
||||
data-slot="progress-indicator"
|
||||
className="bg-primary h-full transition-all duration-300 ease-in-out"
|
||||
style={{ width: `${Math.min(100, Math.max(0, value))}%` }}
|
||||
/>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
export { Progress }
|
||||
91
web/src/core/api/evaluate.ts
Normal file
91
web/src/core/api/evaluate.ts
Normal file
@@ -0,0 +1,91 @@
|
||||
// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
import { resolveServiceURL } from "./resolve-service-url";
|
||||
|
||||
/**
|
||||
* Report evaluation API client.
|
||||
*/
|
||||
|
||||
/**
 * Automated, LLM-free metrics computed from the report content.
 * Field names are snake_case as returned by the backend evaluate endpoint.
 */
export interface EvaluationMetrics {
  word_count: number;
  citation_count: number;
  unique_sources: number;
  image_count: number;
  section_count: number;
  /** Fraction in [0, 1]; the UI renders it as a percentage. */
  section_coverage_score: number;
  sections_found: string[];
  sections_missing: string[];
  has_title: boolean;
  has_key_points: boolean;
  has_overview: boolean;
  has_citations_section: boolean;
}
|
||||
|
||||
/** Per-dimension scores from the LLM-as-Judge evaluation (each on a 1–10 scale, per the API docs above). */
export interface LLMEvaluationScores {
  factual_accuracy: number;
  completeness: number;
  coherence: number;
  relevance: number;
  citation_quality: number;
  writing_quality: number;
}
|
||||
|
||||
/** Full result of the optional LLM-as-Judge deep evaluation. */
export interface LLMEvaluation {
  /** Per-dimension scores; see {@link LLMEvaluationScores}. */
  scores: LLMEvaluationScores;
  overall_score: number;
  weighted_score: number;
  /** Free-text findings; the dialog displays up to the first three of each. */
  strengths: string[];
  weaknesses: string[];
  suggestions: string[];
}
|
||||
|
||||
/** Response payload of the report/evaluate endpoint. */
export interface EvaluationResult {
  /** Automated metrics, always present. */
  metrics: EvaluationMetrics;
  /** Final combined score on a 1–10 scale. */
  score: number;
  /** Letter grade, e.g. "A+" through "F". */
  grade: string;
  /** Present only when the evaluation was run with use_llm = true. */
  llm_evaluation?: LLMEvaluation;
  summary?: string;
}
|
||||
|
||||
/** Request body sent to the report/evaluate endpoint (snake_case to match the backend). */
export interface EvaluateReportRequest {
  /** Report markdown content. */
  content: string;
  /** Original research query. */
  query: string;
  /** Report style identifier; the client defaults this to "default". */
  report_style?: string;
  /** Whether to also run the LLM-as-Judge deep evaluation; defaults to false. */
  use_llm?: boolean;
}
|
||||
|
||||
/**
|
||||
* Evaluate a report's quality using automated metrics and optionally LLM-as-Judge.
|
||||
*
|
||||
* @param content - Report markdown content
|
||||
* @param query - Original research query
|
||||
* @param reportStyle - Report style (academic, news, etc.)
|
||||
* @param useLlm - Whether to use LLM for deep evaluation
|
||||
* @returns Evaluation result with metrics, score, and grade
|
||||
*/
|
||||
export async function evaluateReport(
|
||||
content: string,
|
||||
query: string,
|
||||
reportStyle?: string,
|
||||
useLlm?: boolean,
|
||||
): Promise<EvaluationResult> {
|
||||
const response = await fetch(resolveServiceURL("report/evaluate"), {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
content,
|
||||
query,
|
||||
report_style: reportStyle ?? "default",
|
||||
use_llm: useLlm ?? false,
|
||||
} satisfies EvaluateReportRequest),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`Evaluation failed: ${response.statusText}`);
|
||||
}
|
||||
|
||||
return response.json();
|
||||
}
|
||||
@@ -2,6 +2,7 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
export * from "./chat";
|
||||
export * from "./evaluate";
|
||||
export * from "./mcp";
|
||||
export * from "./podcast";
|
||||
export * from "./prompt-enhancer";
|
||||
|
||||
@@ -24,6 +24,7 @@ export const useStore = create<{
|
||||
researchPlanIds: Map<string, string>;
|
||||
researchReportIds: Map<string, string>;
|
||||
researchActivityIds: Map<string, string[]>;
|
||||
researchQueries: Map<string, string>;
|
||||
ongoingResearchId: string | null;
|
||||
openResearchId: string | null;
|
||||
|
||||
@@ -42,6 +43,7 @@ export const useStore = create<{
|
||||
researchPlanIds: new Map<string, string>(),
|
||||
researchReportIds: new Map<string, string>(),
|
||||
researchActivityIds: new Map<string, string[]>(),
|
||||
researchQueries: new Map<string, string>(),
|
||||
ongoingResearchId: null,
|
||||
openResearchId: null,
|
||||
|
||||
@@ -267,11 +269,17 @@ function getOngoingResearchId() {
|
||||
|
||||
function appendResearch(researchId: string) {
|
||||
let planMessage: Message | undefined;
|
||||
let userQuery: string | undefined;
|
||||
const reversedMessageIds = [...useStore.getState().messageIds].reverse();
|
||||
for (const messageId of reversedMessageIds) {
|
||||
const message = getMessage(messageId);
|
||||
if (message?.agent === "planner") {
|
||||
if (!planMessage && message?.agent === "planner") {
|
||||
planMessage = message;
|
||||
}
|
||||
if (!userQuery && message?.role === "user") {
|
||||
userQuery = message.content;
|
||||
}
|
||||
if (planMessage && userQuery) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -288,6 +296,10 @@ function appendResearch(researchId: string) {
|
||||
researchId,
|
||||
messageIds,
|
||||
),
|
||||
researchQueries: new Map(useStore.getState().researchQueries).set(
|
||||
researchId,
|
||||
userQuery ?? "",
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -394,6 +406,10 @@ export function useResearchMessage(researchId: string) {
|
||||
);
|
||||
}
|
||||
|
||||
export function getResearchQuery(researchId: string): string {
|
||||
return useStore.getState().researchQueries.get(researchId) ?? "";
|
||||
}
|
||||
|
||||
export function useMessage(messageId: string | null | undefined) {
|
||||
return useStore(
|
||||
useShallow((state) =>
|
||||
|
||||
Reference in New Issue
Block a user