feat(eval): add report quality evaluation module and UI integration (#776)

* feat(eval): add report quality evaluation module

Addresses issue #773 - How to evaluate generated report quality objectively.

This module provides two evaluation approaches:
1. Automated metrics (no LLM required):
   - Citation count and source diversity
   - Word count compliance per report style
   - Section structure validation
   - Image inclusion tracking

2. LLM-as-Judge evaluation:
   - Factual accuracy scoring
   - Completeness assessment
   - Coherence evaluation
   - Relevance and citation quality checks

The combined evaluator provides a final score (1-10) and letter grade (A+ to F).

Files added:
- src/eval/__init__.py
- src/eval/metrics.py
- src/eval/llm_judge.py
- src/eval/evaluator.py
- tests/unit/eval/test_metrics.py
- tests/unit/eval/test_evaluator.py

* feat(eval): integrate report evaluation with web UI

This commit adds the web UI integration for the evaluation module:

Backend:
- Add EvaluateReportRequest/Response models in src/server/eval_request.py
- Add /api/report/evaluate endpoint to src/server/app.py

Frontend:
- Add evaluateReport API function in web/src/core/api/evaluate.ts
- Create EvaluationDialog component with grade badge, metrics display,
  and optional LLM deep evaluation
- Add evaluation button (graduation cap icon) to research-block.tsx toolbar
- Add i18n translations for English and Chinese

The evaluation UI allows users to:
1. View quick metrics-only evaluation (instant)
2. Optionally run deep LLM-based evaluation for detailed analysis
3. See grade (A+ to F), score (1-10), and metric breakdown

* feat(eval): improve evaluation reliability and add LLM judge tests

- Extract MAX_REPORT_LENGTH constant in llm_judge.py for maintainability
- Add comprehensive unit tests for LLMJudge class (parse_response,
  calculate_weighted_score, evaluate with mocked LLM)
- Pass reportStyle prop to EvaluationDialog for accurate evaluation criteria
- Add researchQueries store map to reliably associate queries with research
- Add getResearchQuery helper to retrieve query by researchId
- Remove unused imports in test_metrics.py

* fix(eval): use resolveServiceURL for evaluate API endpoint

The evaluateReport function was using a relative URL '/api/report/evaluate'
which sent requests to the Next.js server instead of the FastAPI backend.
Changed to use resolveServiceURL() consistent with other API functions.

* fix: improve type accuracy and React hooks in evaluation components

- Fix get_word_count_target return type from Optional[Dict] to Dict since it always returns a value via default fallback
- Fix useEffect dependency issue in EvaluationDialog using useRef to prevent unwanted re-evaluations
- Add aria-label to GradeBadge for screen reader accessibility
This commit is contained in:
Willem Jiang
2025-12-25 21:55:48 +08:00
committed by GitHub
parent 84a7f7815c
commit 8d9d767051
17 changed files with 2103 additions and 2 deletions

View File

@@ -0,0 +1,300 @@
// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
// SPDX-License-Identifier: MIT
import {
BookOpen,
FileText,
Image,
Link2,
Loader2,
Sparkles,
ThumbsDown,
ThumbsUp,
} from "lucide-react";
import { useTranslations } from "next-intl";
import { useCallback, useEffect, useRef, useState } from "react";
import { Button } from "~/components/ui/button";
import {
Dialog,
DialogContent,
DialogDescription,
DialogHeader,
DialogTitle,
} from "~/components/ui/dialog";
import { Progress } from "~/components/ui/progress";
import { evaluateReport, type EvaluationResult } from "~/core/api";
import { cn } from "~/lib/utils";
/** Props for the {@link EvaluationDialog} component. */
interface EvaluationDialogProps {
  /** Whether the dialog is currently shown. */
  open: boolean;
  /** Called when the dialog requests an open-state change (e.g. close). */
  onOpenChange: (open: boolean) => void;
  /** Markdown content of the report to be evaluated. */
  reportContent: string;
  /** Original research query the report is answering. */
  query: string;
  /** Report style identifier (e.g. "academic"); passed through to the API. */
  reportStyle?: string;
}
/**
 * Circular badge displaying a letter grade, colored by grade tier.
 * Grades not present in the color table fall back to a neutral gray.
 */
function GradeBadge({ grade }: { grade: string }) {
  const tierColors: Record<string, string> = {
    "A+": "bg-emerald-500",
    A: "bg-emerald-500",
    "A-": "bg-emerald-400",
    "B+": "bg-blue-500",
    B: "bg-blue-500",
    "B-": "bg-blue-400",
    "C+": "bg-yellow-500",
    C: "bg-yellow-500",
    "C-": "bg-yellow-400",
    D: "bg-orange-500",
    F: "bg-red-500",
  };
  // Resolve the background once; unknown grades get the gray fallback.
  const background = tierColors[grade] ?? "bg-gray-500";
  return (
    <div
      aria-label={`Report grade: ${grade}`}
      className={cn(
        "flex h-16 w-16 items-center justify-center rounded-full text-2xl font-bold text-white",
        background,
      )}
    >
      {grade}
    </div>
  );
}
/**
 * One row of the metrics panel: a leading icon, a muted label, and a
 * right-aligned value with an optional suffix rendered immediately after it.
 */
function MetricItem(props: {
  icon: React.ComponentType<{ className?: string }>;
  label: string;
  value: number | string;
  suffix?: string;
}) {
  // Destructure here (rather than in the signature) and alias the icon
  // component to an uppercase name so JSX treats it as a component.
  const { icon: Icon, label, value, suffix } = props;
  return (
    <div className="flex items-center gap-3">
      <Icon className="text-muted-foreground h-4 w-4" />
      <span className="text-muted-foreground text-sm">{label}</span>
      <span className="ml-auto font-medium">
        {value}
        {suffix}
      </span>
    </div>
  );
}
/**
 * Dialog that evaluates a generated report and presents the results.
 *
 * On first open it automatically runs the fast metrics-only evaluation
 * (no LLM). From the result view the user may optionally trigger the
 * deeper LLM-based evaluation, whose response replaces the current result
 * and additionally carries `llm_evaluation` details.
 */
export function EvaluationDialog({
  open,
  onOpenChange,
  reportContent,
  query,
  reportStyle,
}: EvaluationDialogProps) {
  const t = useTranslations("chat.evaluation");
  // `loading` covers the initial metrics-only pass; `deepLoading` covers the
  // optional LLM pass, so the two spinners can be shown independently.
  const [loading, setLoading] = useState(false);
  const [deepLoading, setDeepLoading] = useState(false);
  const [result, setResult] = useState<EvaluationResult | null>(null);
  const [error, setError] = useState<string | null>(null);
  // Ref (not state) so setting it does not re-render and cannot retrigger
  // the auto-run effect below.
  const hasRunInitialEvaluation = useRef(false);
  // Calls the evaluate API; `useLlm` selects metrics-only vs. deep mode.
  const runEvaluation = useCallback(
    async (useLlm: boolean) => {
      if (useLlm) {
        setDeepLoading(true);
      } else {
        setLoading(true);
      }
      setError(null);
      try {
        const evalResult = await evaluateReport(
          reportContent,
          query,
          reportStyle,
          useLlm,
        );
        setResult(evalResult);
      } catch (err) {
        setError(err instanceof Error ? err.message : "Evaluation failed");
      } finally {
        // Clear both flags unconditionally; only one was set above.
        setLoading(false);
        setDeepLoading(false);
      }
    },
    [reportContent, query, reportStyle],
  );
  // Auto-run the quick (non-LLM) evaluation once per dialog opening.
  useEffect(() => {
    if (open && !hasRunInitialEvaluation.current) {
      hasRunInitialEvaluation.current = true;
      void runEvaluation(false);
    }
  }, [open, runEvaluation]);
  // Reset all transient state on close so the next open starts fresh.
  useEffect(() => {
    if (!open) {
      setResult(null);
      setError(null);
      hasRunInitialEvaluation.current = false;
    }
  }, [open]);
  return (
    <Dialog open={open} onOpenChange={onOpenChange}>
      <DialogContent className="sm:max-w-md">
        <DialogHeader>
          <DialogTitle>{t("title")}</DialogTitle>
          <DialogDescription>{t("description")}</DialogDescription>
        </DialogHeader>
        {loading && !result ? (
          <div className="flex flex-col items-center justify-center py-8">
            <Loader2 className="h-8 w-8 animate-spin text-blue-500" />
            <p className="text-muted-foreground mt-4 text-sm">
              {t("evaluating")}
            </p>
          </div>
        ) : error ? (
          <div className="py-4 text-center text-red-500">{error}</div>
        ) : result ? (
          <div className="space-y-6">
            {/* Grade and Score */}
            <div className="flex items-center gap-6">
              <GradeBadge grade={result.grade} />
              <div>
                <div className="text-3xl font-bold">{result.score}/10</div>
                <div className="text-muted-foreground text-sm">
                  {t("overallScore")}
                </div>
              </div>
            </div>
            {/* Metrics */}
            <div className="space-y-3">
              <h4 className="text-sm font-medium">{t("metrics")}</h4>
              <div className="bg-muted/50 space-y-2 rounded-lg p-3">
                <MetricItem
                  icon={FileText}
                  label={t("wordCount")}
                  value={result.metrics.word_count.toLocaleString()}
                />
                <MetricItem
                  icon={Link2}
                  label={t("citations")}
                  value={result.metrics.citation_count}
                />
                <MetricItem
                  icon={BookOpen}
                  label={t("sources")}
                  value={result.metrics.unique_sources}
                />
                <MetricItem
                  icon={Image}
                  label={t("images")}
                  value={result.metrics.image_count}
                />
                <div className="pt-2">
                  <div className="mb-1 flex items-center justify-between text-sm">
                    <span className="text-muted-foreground">
                      {t("sectionCoverage")}
                    </span>
                    <span className="font-medium">
                      {Math.round(result.metrics.section_coverage_score * 100)}%
                    </span>
                  </div>
                  <Progress
                    value={result.metrics.section_coverage_score * 100}
                    className="h-2"
                  />
                </div>
              </div>
            </div>
            {/* LLM Evaluation Results (present only after a deep evaluation) */}
            {result.llm_evaluation && (
              <div className="space-y-3">
                <h4 className="text-sm font-medium">{t("detailedAnalysis")}</h4>
                {/* LLM Scores */}
                <div className="bg-muted/50 grid grid-cols-2 gap-2 rounded-lg p-3 text-sm">
                  {Object.entries(result.llm_evaluation.scores).map(
                    ([key, value]) => (
                      <div key={key} className="flex justify-between">
                        <span className="text-muted-foreground">
                          {t(`scores.${key}`)}
                        </span>
                        <span className="font-medium">{value}/10</span>
                      </div>
                    ),
                  )}
                </div>
                {/* Strengths (capped at the first three) */}
                {result.llm_evaluation.strengths.length > 0 && (
                  <div className="space-y-2">
                    <div className="flex items-center gap-2 text-sm font-medium text-emerald-600">
                      <ThumbsUp className="h-4 w-4" />
                      {t("strengths")}
                    </div>
                    <ul className="space-y-1 text-sm">
                      {result.llm_evaluation.strengths
                        .slice(0, 3)
                        .map((s, i) => (
                          <li key={i} className="text-muted-foreground">
                            {s}
                          </li>
                        ))}
                    </ul>
                  </div>
                )}
                {/* Weaknesses (capped at the first three) */}
                {result.llm_evaluation.weaknesses.length > 0 && (
                  <div className="space-y-2">
                    <div className="flex items-center gap-2 text-sm font-medium text-orange-600">
                      <ThumbsDown className="h-4 w-4" />
                      {t("weaknesses")}
                    </div>
                    <ul className="space-y-1 text-sm">
                      {result.llm_evaluation.weaknesses
                        .slice(0, 3)
                        .map((w, i) => (
                          <li key={i} className="text-muted-foreground">
                            {w}
                          </li>
                        ))}
                    </ul>
                  </div>
                )}
              </div>
            )}
            {/* Deep Evaluation Button — hidden once LLM results exist */}
            {!result.llm_evaluation && (
              <Button
                variant="outline"
                className="w-full"
                onClick={() => runEvaluation(true)}
                disabled={deepLoading}
              >
                {deepLoading ? (
                  <>
                    <Loader2 className="mr-2 h-4 w-4 animate-spin" />
                    {t("analyzing")}
                  </>
                ) : (
                  <>
                    <Sparkles className="mr-2 h-4 w-4" />
                    {t("deepEvaluation")}
                  </>
                )}
              </Button>
            )}
          </div>
        ) : null}
      </DialogContent>
    </Dialog>
  );
}

View File

@@ -16,6 +16,7 @@ import { jsPDF } from "jspdf";
import {
Check,
Copy,
GraduationCap,
Headphones,
Pencil,
Undo2,
@@ -43,9 +44,10 @@ import {
} from "~/components/ui/dropdown-menu";
import { Tabs, TabsContent, TabsList, TabsTrigger } from "~/components/ui/tabs";
import { useReplay } from "~/core/replay";
import { closeResearch, listenToPodcast, useStore } from "~/core/store";
import { closeResearch, getResearchQuery, listenToPodcast, useStore, useSettingsStore } from "~/core/store";
import { cn } from "~/lib/utils";
import { EvaluationDialog } from "./evaluation-dialog";
import { ResearchActivitiesBlock } from "./research-activities-block";
import { ResearchReportBlock } from "./research-report-block";
@@ -84,6 +86,7 @@ export function ResearchBlock({
const [editing, setEditing] = useState(false);
const [isDownloading, setIsDownloading] = useState(false);
const [copied, setCopied] = useState(false);
const [showEvaluation, setShowEvaluation] = useState(false);
const handleCopy = useCallback(() => {
if (!reportId) {
return;
@@ -676,6 +679,16 @@ ${htmlContent}
{copied ? <Check /> : <Copy />}
</Button>
</Tooltip>
<Tooltip title={t("evaluateReport")}>
<Button
className="text-gray-400"
size="icon"
variant="ghost"
onClick={() => setShowEvaluation(true)}
>
<GraduationCap />
</Button>
</Tooltip>
<DropdownMenu>
<Tooltip title={t("downloadReport")}>
<DropdownMenuTrigger asChild>
@@ -796,6 +809,19 @@ ${htmlContent}
</TabsContent>
</Tabs>
</Card>
{/* Evaluation Dialog */}
{reportId && researchId && (
<EvaluationDialog
open={showEvaluation}
onOpenChange={setShowEvaluation}
reportContent={
useStore.getState().messages.get(reportId)?.content ?? ""
}
query={getResearchQuery(researchId)}
reportStyle={useSettingsStore.getState().general.reportStyle.toLowerCase()}
/>
)}
</div>
);
}

View File

@@ -0,0 +1,30 @@
"use client"
import * as React from "react"
import { cn } from "~/lib/utils"
/** Props for {@link Progress}; all standard div attributes pass through. */
interface ProgressProps extends React.HTMLAttributes<HTMLDivElement> {
  /** Completion percentage, expected in the 0–100 range; defaults to 0. */
  value?: number
}
/**
 * Minimal determinate progress bar.
 *
 * Renders a track with a filled indicator whose width reflects `value`,
 * clamped to the 0–100 range. Exposes WAI-ARIA progressbar semantics
 * (role, aria-valuemin/max/now) so assistive technology can announce the
 * current value; callers may still override these via spread props.
 */
function Progress({ className, value = 0, ...props }: ProgressProps) {
  // Clamp once so the visual width and the announced ARIA value always agree.
  const clamped = Math.min(100, Math.max(0, value))
  return (
    <div
      data-slot="progress"
      role="progressbar"
      aria-valuemin={0}
      aria-valuemax={100}
      aria-valuenow={clamped}
      className={cn(
        "bg-primary/20 relative h-2 w-full overflow-hidden rounded-full",
        className
      )}
      {...props}
    >
      <div
        data-slot="progress-indicator"
        className="bg-primary h-full transition-all duration-300 ease-in-out"
        style={{ width: `${clamped}%` }}
      />
    </div>
  )
}
export { Progress }

View File

@@ -0,0 +1,91 @@
// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
// SPDX-License-Identifier: MIT
import { resolveServiceURL } from "./resolve-service-url";
/**
* Report evaluation API client.
*/
/** Automated (non-LLM) metrics computed from the report content. */
export interface EvaluationMetrics {
  /** Total word count of the report. */
  word_count: number;
  /** Number of citations found in the report. */
  citation_count: number;
  /** Number of distinct cited sources. */
  unique_sources: number;
  /** Number of images included in the report. */
  image_count: number;
  /** Number of sections detected. */
  section_count: number;
  /** Fraction in [0, 1]; the UI renders it as a percentage. */
  section_coverage_score: number;
  /** Expected sections that were found. */
  sections_found: string[];
  /** Expected sections that were not found. */
  sections_missing: string[];
  // Structural presence flags for the standard report sections.
  has_title: boolean;
  has_key_points: boolean;
  has_overview: boolean;
  has_citations_section: boolean;
}
/** Per-dimension scores from the LLM-as-Judge evaluation (displayed as n/10). */
export interface LLMEvaluationScores {
  factual_accuracy: number;
  completeness: number;
  coherence: number;
  relevance: number;
  citation_quality: number;
  writing_quality: number;
}
/** Result of the optional LLM-as-Judge deep evaluation. */
export interface LLMEvaluation {
  /** Per-dimension scores. */
  scores: LLMEvaluationScores;
  /** Overall score reported by the judge. */
  overall_score: number;
  /** Aggregate of the per-dimension scores after weighting. */
  weighted_score: number;
  /** Identified strengths (the dialog shows at most the first three). */
  strengths: string[];
  /** Identified weaknesses (the dialog shows at most the first three). */
  weaknesses: string[];
  /** Improvement suggestions from the judge. */
  suggestions: string[];
}
/** Combined evaluation response returned by the report/evaluate endpoint. */
export interface EvaluationResult {
  /** Automated metrics; always present. */
  metrics: EvaluationMetrics;
  /** Final score on a 1–10 scale. */
  score: number;
  /** Letter grade, "A+" through "F". */
  grade: string;
  /** Present only when the request was made with `use_llm` enabled. */
  llm_evaluation?: LLMEvaluation;
  /** Optional human-readable summary of the evaluation. */
  summary?: string;
}
/** JSON request body sent to the report/evaluate endpoint. */
export interface EvaluateReportRequest {
  /** Report markdown content. */
  content: string;
  /** Original research query. */
  query: string;
  /** Report style; the client sends "default" when omitted by the caller. */
  report_style?: string;
  /** Whether to run the LLM deep evaluation; the client defaults to false. */
  use_llm?: boolean;
}
/**
 * Evaluate a report's quality using automated metrics and optionally
 * LLM-as-Judge.
 *
 * @param content - Report markdown content
 * @param query - Original research query
 * @param reportStyle - Report style (academic, news, etc.); defaults to "default"
 * @param useLlm - Whether to use LLM for deep evaluation; defaults to false
 * @returns Evaluation result with metrics, score, and grade
 * @throws Error when the backend responds with a non-2xx status
 */
export async function evaluateReport(
  content: string,
  query: string,
  reportStyle?: string,
  useLlm?: boolean,
): Promise<EvaluationResult> {
  const response = await fetch(resolveServiceURL("report/evaluate"), {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      content,
      query,
      report_style: reportStyle ?? "default",
      use_llm: useLlm ?? false,
    } satisfies EvaluateReportRequest),
  });
  if (!response.ok) {
    // statusText is often empty under HTTP/2, so include the numeric status
    // to keep the surfaced error message meaningful.
    throw new Error(
      `Evaluation failed: ${response.status} ${response.statusText}`,
    );
  }
  // Narrow the untyped json() result to the declared response shape.
  return (await response.json()) as EvaluationResult;
}

View File

@@ -2,6 +2,7 @@
// SPDX-License-Identifier: MIT
export * from "./chat";
export * from "./evaluate";
export * from "./mcp";
export * from "./podcast";
export * from "./prompt-enhancer";

View File

@@ -24,6 +24,7 @@ export const useStore = create<{
researchPlanIds: Map<string, string>;
researchReportIds: Map<string, string>;
researchActivityIds: Map<string, string[]>;
researchQueries: Map<string, string>;
ongoingResearchId: string | null;
openResearchId: string | null;
@@ -42,6 +43,7 @@ export const useStore = create<{
researchPlanIds: new Map<string, string>(),
researchReportIds: new Map<string, string>(),
researchActivityIds: new Map<string, string[]>(),
researchQueries: new Map<string, string>(),
ongoingResearchId: null,
openResearchId: null,
@@ -267,11 +269,17 @@ function getOngoingResearchId() {
function appendResearch(researchId: string) {
let planMessage: Message | undefined;
let userQuery: string | undefined;
const reversedMessageIds = [...useStore.getState().messageIds].reverse();
for (const messageId of reversedMessageIds) {
const message = getMessage(messageId);
if (message?.agent === "planner") {
if (!planMessage && message?.agent === "planner") {
planMessage = message;
}
if (!userQuery && message?.role === "user") {
userQuery = message.content;
}
if (planMessage && userQuery) {
break;
}
}
@@ -288,6 +296,10 @@ function appendResearch(researchId: string) {
researchId,
messageIds,
),
researchQueries: new Map(useStore.getState().researchQueries).set(
researchId,
userQuery ?? "",
),
});
}
@@ -394,6 +406,10 @@ export function useResearchMessage(researchId: string) {
);
}
/**
 * Look up the original user query recorded for a research session.
 * Returns an empty string when no query was recorded for the researchId.
 */
export function getResearchQuery(researchId: string): string {
  return useStore.getState().researchQueries.get(researchId) ?? "";
}
export function useMessage(messageId: string | null | undefined) {
return useStore(
useShallow((state) =>