feat(eval): add report quality evaluation module and UI integration (#776)

* feat(eval): add report quality evaluation module

Addresses issue #773 - How to evaluate generated report quality objectively.

This module provides two evaluation approaches:
1. Automated metrics (no LLM required):
   - Citation count and source diversity
   - Word count compliance per report style
   - Section structure validation
   - Image inclusion tracking

2. LLM-as-Judge evaluation:
   - Factual accuracy scoring
   - Completeness assessment
   - Coherence evaluation
   - Relevance and citation quality checks

The combined evaluator provides a final score (1-10) and letter grade (A+ to F).

Files added:
- src/eval/__init__.py
- src/eval/metrics.py
- src/eval/llm_judge.py
- src/eval/evaluator.py
- tests/unit/eval/test_metrics.py
- tests/unit/eval/test_evaluator.py

* feat(eval): integrate report evaluation with web UI

This commit adds the web UI integration for the evaluation module:

Backend:
- Add EvaluateReportRequest/Response models in src/server/eval_request.py
- Add /api/report/evaluate endpoint to src/server/app.py

Frontend:
- Add evaluateReport API function in web/src/core/api/evaluate.ts
- Create EvaluationDialog component with grade badge, metrics display,
  and optional LLM deep evaluation
- Add evaluation button (graduation cap icon) to research-block.tsx toolbar
- Add i18n translations for English and Chinese

The evaluation UI allows users to:
1. View quick metrics-only evaluation (instant)
2. Optionally run deep LLM-based evaluation for detailed analysis
3. See grade (A+ to F), score (1-10), and metric breakdown

* feat(eval): improve evaluation reliability and add LLM judge tests

- Extract MAX_REPORT_LENGTH constant in llm_judge.py for maintainability
- Add comprehensive unit tests for LLMJudge class (parse_response,
  calculate_weighted_score, evaluate with mocked LLM)
- Pass reportStyle prop to EvaluationDialog for accurate evaluation criteria
- Add researchQueries store map to reliably associate queries with research
- Add getResearchQuery helper to retrieve query by researchId
- Remove unused imports in test_metrics.py

* fix(eval): use resolveServiceURL for evaluate API endpoint

The evaluateReport function was using a relative URL '/api/report/evaluate'
which sent requests to the Next.js server instead of the FastAPI backend.
Changed to use resolveServiceURL() consistent with other API functions.

* fix: improve type accuracy and React hooks in evaluation components

- Fix get_word_count_target return type from Optional[Dict] to Dict since it always returns a value via default fallback
- Fix useEffect dependency issue in EvaluationDialog using useRef to prevent unwanted re-evaluations
- Add aria-label to GradeBadge for screen reader accessibility
This commit is contained in:
Willem Jiang
2025-12-25 21:55:48 +08:00
committed by GitHub
parent 84a7f7815c
commit 8d9d767051
17 changed files with 2103 additions and 2 deletions

View File

@@ -0,0 +1,300 @@
// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
// SPDX-License-Identifier: MIT
import {
BookOpen,
FileText,
Image,
Link2,
Loader2,
Sparkles,
ThumbsDown,
ThumbsUp,
} from "lucide-react";
import { useTranslations } from "next-intl";
import { useCallback, useEffect, useRef, useState } from "react";
import { Button } from "~/components/ui/button";
import {
Dialog,
DialogContent,
DialogDescription,
DialogHeader,
DialogTitle,
} from "~/components/ui/dialog";
import { Progress } from "~/components/ui/progress";
import { evaluateReport, type EvaluationResult } from "~/core/api";
import { cn } from "~/lib/utils";
/** Props for the {@link EvaluationDialog} component. */
interface EvaluationDialogProps {
  /** Whether the dialog is currently shown. */
  open: boolean;
  /** Called when the dialog requests an open-state change (e.g. close). */
  onOpenChange: (open: boolean) => void;
  /** Markdown content of the report to be evaluated. */
  reportContent: string;
  /** Original research query the report is answering. */
  query: string;
  /** Report style identifier (e.g. "academic"); passed through to the API. */
  reportStyle?: string;
}
/**
 * Circular badge displaying a letter grade, colored by grade tier.
 * Grades not present in the color table fall back to a neutral gray.
 */
function GradeBadge({ grade }: { grade: string }) {
  const tierColors: Record<string, string> = {
    "A+": "bg-emerald-500",
    A: "bg-emerald-500",
    "A-": "bg-emerald-400",
    "B+": "bg-blue-500",
    B: "bg-blue-500",
    "B-": "bg-blue-400",
    "C+": "bg-yellow-500",
    C: "bg-yellow-500",
    "C-": "bg-yellow-400",
    D: "bg-orange-500",
    F: "bg-red-500",
  };
  // Resolve the background once; unknown grades get the gray fallback.
  const background = tierColors[grade] ?? "bg-gray-500";
  return (
    <div
      aria-label={`Report grade: ${grade}`}
      className={cn(
        "flex h-16 w-16 items-center justify-center rounded-full text-2xl font-bold text-white",
        background,
      )}
    >
      {grade}
    </div>
  );
}
/**
 * One row of the metrics panel: a leading icon, a muted label, and a
 * right-aligned value with an optional suffix rendered immediately after it.
 */
function MetricItem(props: {
  icon: React.ComponentType<{ className?: string }>;
  label: string;
  value: number | string;
  suffix?: string;
}) {
  // Destructure here (rather than in the signature) and alias the icon
  // component to an uppercase name so JSX treats it as a component.
  const { icon: Icon, label, value, suffix } = props;
  return (
    <div className="flex items-center gap-3">
      <Icon className="text-muted-foreground h-4 w-4" />
      <span className="text-muted-foreground text-sm">{label}</span>
      <span className="ml-auto font-medium">
        {value}
        {suffix}
      </span>
    </div>
  );
}
/**
 * Dialog that evaluates a generated report and presents the results.
 *
 * On first open it automatically runs the fast metrics-only evaluation
 * (no LLM). From the result view the user may optionally trigger the
 * deeper LLM-based evaluation, whose response replaces the current result
 * and additionally carries `llm_evaluation` details.
 */
export function EvaluationDialog({
  open,
  onOpenChange,
  reportContent,
  query,
  reportStyle,
}: EvaluationDialogProps) {
  const t = useTranslations("chat.evaluation");
  // `loading` covers the initial metrics-only pass; `deepLoading` covers the
  // optional LLM pass, so the two spinners can be shown independently.
  const [loading, setLoading] = useState(false);
  const [deepLoading, setDeepLoading] = useState(false);
  const [result, setResult] = useState<EvaluationResult | null>(null);
  const [error, setError] = useState<string | null>(null);
  // Ref (not state) so setting it does not re-render and cannot retrigger
  // the auto-run effect below.
  const hasRunInitialEvaluation = useRef(false);
  // Calls the evaluate API; `useLlm` selects metrics-only vs. deep mode.
  const runEvaluation = useCallback(
    async (useLlm: boolean) => {
      if (useLlm) {
        setDeepLoading(true);
      } else {
        setLoading(true);
      }
      setError(null);
      try {
        const evalResult = await evaluateReport(
          reportContent,
          query,
          reportStyle,
          useLlm,
        );
        setResult(evalResult);
      } catch (err) {
        setError(err instanceof Error ? err.message : "Evaluation failed");
      } finally {
        // Clear both flags unconditionally; only one was set above.
        setLoading(false);
        setDeepLoading(false);
      }
    },
    [reportContent, query, reportStyle],
  );
  // Auto-run the quick (non-LLM) evaluation once per dialog opening.
  useEffect(() => {
    if (open && !hasRunInitialEvaluation.current) {
      hasRunInitialEvaluation.current = true;
      void runEvaluation(false);
    }
  }, [open, runEvaluation]);
  // Reset all transient state on close so the next open starts fresh.
  useEffect(() => {
    if (!open) {
      setResult(null);
      setError(null);
      hasRunInitialEvaluation.current = false;
    }
  }, [open]);
  return (
    <Dialog open={open} onOpenChange={onOpenChange}>
      <DialogContent className="sm:max-w-md">
        <DialogHeader>
          <DialogTitle>{t("title")}</DialogTitle>
          <DialogDescription>{t("description")}</DialogDescription>
        </DialogHeader>
        {loading && !result ? (
          <div className="flex flex-col items-center justify-center py-8">
            <Loader2 className="h-8 w-8 animate-spin text-blue-500" />
            <p className="text-muted-foreground mt-4 text-sm">
              {t("evaluating")}
            </p>
          </div>
        ) : error ? (
          <div className="py-4 text-center text-red-500">{error}</div>
        ) : result ? (
          <div className="space-y-6">
            {/* Grade and Score */}
            <div className="flex items-center gap-6">
              <GradeBadge grade={result.grade} />
              <div>
                <div className="text-3xl font-bold">{result.score}/10</div>
                <div className="text-muted-foreground text-sm">
                  {t("overallScore")}
                </div>
              </div>
            </div>
            {/* Metrics */}
            <div className="space-y-3">
              <h4 className="text-sm font-medium">{t("metrics")}</h4>
              <div className="bg-muted/50 space-y-2 rounded-lg p-3">
                <MetricItem
                  icon={FileText}
                  label={t("wordCount")}
                  value={result.metrics.word_count.toLocaleString()}
                />
                <MetricItem
                  icon={Link2}
                  label={t("citations")}
                  value={result.metrics.citation_count}
                />
                <MetricItem
                  icon={BookOpen}
                  label={t("sources")}
                  value={result.metrics.unique_sources}
                />
                <MetricItem
                  icon={Image}
                  label={t("images")}
                  value={result.metrics.image_count}
                />
                <div className="pt-2">
                  <div className="mb-1 flex items-center justify-between text-sm">
                    <span className="text-muted-foreground">
                      {t("sectionCoverage")}
                    </span>
                    <span className="font-medium">
                      {Math.round(result.metrics.section_coverage_score * 100)}%
                    </span>
                  </div>
                  <Progress
                    value={result.metrics.section_coverage_score * 100}
                    className="h-2"
                  />
                </div>
              </div>
            </div>
            {/* LLM Evaluation Results (present only after a deep evaluation) */}
            {result.llm_evaluation && (
              <div className="space-y-3">
                <h4 className="text-sm font-medium">{t("detailedAnalysis")}</h4>
                {/* LLM Scores */}
                <div className="bg-muted/50 grid grid-cols-2 gap-2 rounded-lg p-3 text-sm">
                  {Object.entries(result.llm_evaluation.scores).map(
                    ([key, value]) => (
                      <div key={key} className="flex justify-between">
                        <span className="text-muted-foreground">
                          {t(`scores.${key}`)}
                        </span>
                        <span className="font-medium">{value}/10</span>
                      </div>
                    ),
                  )}
                </div>
                {/* Strengths (capped at the first three) */}
                {result.llm_evaluation.strengths.length > 0 && (
                  <div className="space-y-2">
                    <div className="flex items-center gap-2 text-sm font-medium text-emerald-600">
                      <ThumbsUp className="h-4 w-4" />
                      {t("strengths")}
                    </div>
                    <ul className="space-y-1 text-sm">
                      {result.llm_evaluation.strengths
                        .slice(0, 3)
                        .map((s, i) => (
                          <li key={i} className="text-muted-foreground">
                            {s}
                          </li>
                        ))}
                    </ul>
                  </div>
                )}
                {/* Weaknesses (capped at the first three) */}
                {result.llm_evaluation.weaknesses.length > 0 && (
                  <div className="space-y-2">
                    <div className="flex items-center gap-2 text-sm font-medium text-orange-600">
                      <ThumbsDown className="h-4 w-4" />
                      {t("weaknesses")}
                    </div>
                    <ul className="space-y-1 text-sm">
                      {result.llm_evaluation.weaknesses
                        .slice(0, 3)
                        .map((w, i) => (
                          <li key={i} className="text-muted-foreground">
                            {w}
                          </li>
                        ))}
                    </ul>
                  </div>
                )}
              </div>
            )}
            {/* Deep Evaluation Button — hidden once LLM results exist */}
            {!result.llm_evaluation && (
              <Button
                variant="outline"
                className="w-full"
                onClick={() => runEvaluation(true)}
                disabled={deepLoading}
              >
                {deepLoading ? (
                  <>
                    <Loader2 className="mr-2 h-4 w-4 animate-spin" />
                    {t("analyzing")}
                  </>
                ) : (
                  <>
                    <Sparkles className="mr-2 h-4 w-4" />
                    {t("deepEvaluation")}
                  </>
                )}
              </Button>
            )}
          </div>
        ) : null}
      </DialogContent>
    </Dialog>
  );
}

View File

@@ -16,6 +16,7 @@ import { jsPDF } from "jspdf";
import {
Check,
Copy,
GraduationCap,
Headphones,
Pencil,
Undo2,
@@ -43,9 +44,10 @@ import {
} from "~/components/ui/dropdown-menu";
import { Tabs, TabsContent, TabsList, TabsTrigger } from "~/components/ui/tabs";
import { useReplay } from "~/core/replay";
import { closeResearch, listenToPodcast, useStore } from "~/core/store";
import { closeResearch, getResearchQuery, listenToPodcast, useStore, useSettingsStore } from "~/core/store";
import { cn } from "~/lib/utils";
import { EvaluationDialog } from "./evaluation-dialog";
import { ResearchActivitiesBlock } from "./research-activities-block";
import { ResearchReportBlock } from "./research-report-block";
@@ -84,6 +86,7 @@ export function ResearchBlock({
const [editing, setEditing] = useState(false);
const [isDownloading, setIsDownloading] = useState(false);
const [copied, setCopied] = useState(false);
const [showEvaluation, setShowEvaluation] = useState(false);
const handleCopy = useCallback(() => {
if (!reportId) {
return;
@@ -676,6 +679,16 @@ ${htmlContent}
{copied ? <Check /> : <Copy />}
</Button>
</Tooltip>
<Tooltip title={t("evaluateReport")}>
<Button
className="text-gray-400"
size="icon"
variant="ghost"
onClick={() => setShowEvaluation(true)}
>
<GraduationCap />
</Button>
</Tooltip>
<DropdownMenu>
<Tooltip title={t("downloadReport")}>
<DropdownMenuTrigger asChild>
@@ -796,6 +809,19 @@ ${htmlContent}
</TabsContent>
</Tabs>
</Card>
{/* Evaluation Dialog */}
{reportId && researchId && (
<EvaluationDialog
open={showEvaluation}
onOpenChange={setShowEvaluation}
reportContent={
useStore.getState().messages.get(reportId)?.content ?? ""
}
query={getResearchQuery(researchId)}
reportStyle={useSettingsStore.getState().general.reportStyle.toLowerCase()}
/>
)}
</div>
);
}

View File

@@ -0,0 +1,30 @@
"use client"
import * as React from "react"
import { cn } from "~/lib/utils"
/** Props for {@link Progress}; all standard div attributes pass through. */
interface ProgressProps extends React.HTMLAttributes<HTMLDivElement> {
  /** Completion percentage, expected in the 0–100 range; defaults to 0. */
  value?: number
}
/**
 * Minimal determinate progress bar.
 *
 * Renders a track with a filled indicator whose width reflects `value`,
 * clamped to the 0–100 range. Exposes WAI-ARIA progressbar semantics
 * (role, aria-valuemin/max/now) so assistive technology can announce the
 * current value; callers may still override these via spread props.
 */
function Progress({ className, value = 0, ...props }: ProgressProps) {
  // Clamp once so the visual width and the announced ARIA value always agree.
  const clamped = Math.min(100, Math.max(0, value))
  return (
    <div
      data-slot="progress"
      role="progressbar"
      aria-valuemin={0}
      aria-valuemax={100}
      aria-valuenow={clamped}
      className={cn(
        "bg-primary/20 relative h-2 w-full overflow-hidden rounded-full",
        className
      )}
      {...props}
    >
      <div
        data-slot="progress-indicator"
        className="bg-primary h-full transition-all duration-300 ease-in-out"
        style={{ width: `${clamped}%` }}
      />
    </div>
  )
}
export { Progress }

View File

@@ -0,0 +1,91 @@
// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
// SPDX-License-Identifier: MIT
import { resolveServiceURL } from "./resolve-service-url";
/**
* Report evaluation API client.
*/
/** Automated (non-LLM) metrics computed from the report content. */
export interface EvaluationMetrics {
  /** Total word count of the report. */
  word_count: number;
  /** Number of citations found in the report. */
  citation_count: number;
  /** Number of distinct cited sources. */
  unique_sources: number;
  /** Number of images included in the report. */
  image_count: number;
  /** Number of sections detected. */
  section_count: number;
  /** Fraction in [0, 1]; the UI renders it as a percentage. */
  section_coverage_score: number;
  /** Expected sections that were found. */
  sections_found: string[];
  /** Expected sections that were not found. */
  sections_missing: string[];
  // Structural presence flags for the standard report sections.
  has_title: boolean;
  has_key_points: boolean;
  has_overview: boolean;
  has_citations_section: boolean;
}
/** Per-dimension scores from the LLM-as-Judge evaluation (displayed as n/10). */
export interface LLMEvaluationScores {
  factual_accuracy: number;
  completeness: number;
  coherence: number;
  relevance: number;
  citation_quality: number;
  writing_quality: number;
}
/** Result of the optional LLM-as-Judge deep evaluation. */
export interface LLMEvaluation {
  /** Per-dimension scores. */
  scores: LLMEvaluationScores;
  /** Overall score reported by the judge. */
  overall_score: number;
  /** Aggregate of the per-dimension scores after weighting. */
  weighted_score: number;
  /** Identified strengths (the dialog shows at most the first three). */
  strengths: string[];
  /** Identified weaknesses (the dialog shows at most the first three). */
  weaknesses: string[];
  /** Improvement suggestions from the judge. */
  suggestions: string[];
}
/** Combined evaluation response returned by the report/evaluate endpoint. */
export interface EvaluationResult {
  /** Automated metrics; always present. */
  metrics: EvaluationMetrics;
  /** Final score on a 1–10 scale. */
  score: number;
  /** Letter grade, "A+" through "F". */
  grade: string;
  /** Present only when the request was made with `use_llm` enabled. */
  llm_evaluation?: LLMEvaluation;
  /** Optional human-readable summary of the evaluation. */
  summary?: string;
}
/** JSON request body sent to the report/evaluate endpoint. */
export interface EvaluateReportRequest {
  /** Report markdown content. */
  content: string;
  /** Original research query. */
  query: string;
  /** Report style; the client sends "default" when omitted by the caller. */
  report_style?: string;
  /** Whether to run the LLM deep evaluation; the client defaults to false. */
  use_llm?: boolean;
}
/**
 * Evaluate a report's quality using automated metrics and optionally
 * LLM-as-Judge.
 *
 * @param content - Report markdown content
 * @param query - Original research query
 * @param reportStyle - Report style (academic, news, etc.); defaults to "default"
 * @param useLlm - Whether to use LLM for deep evaluation; defaults to false
 * @returns Evaluation result with metrics, score, and grade
 * @throws Error when the backend responds with a non-2xx status
 */
export async function evaluateReport(
  content: string,
  query: string,
  reportStyle?: string,
  useLlm?: boolean,
): Promise<EvaluationResult> {
  const response = await fetch(resolveServiceURL("report/evaluate"), {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      content,
      query,
      report_style: reportStyle ?? "default",
      use_llm: useLlm ?? false,
    } satisfies EvaluateReportRequest),
  });
  if (!response.ok) {
    // statusText is often empty under HTTP/2, so include the numeric status
    // to keep the surfaced error message meaningful.
    throw new Error(
      `Evaluation failed: ${response.status} ${response.statusText}`,
    );
  }
  // Narrow the untyped json() result to the declared response shape.
  return (await response.json()) as EvaluationResult;
}

View File

@@ -2,6 +2,7 @@
// SPDX-License-Identifier: MIT
export * from "./chat";
export * from "./evaluate";
export * from "./mcp";
export * from "./podcast";
export * from "./prompt-enhancer";

View File

@@ -24,6 +24,7 @@ export const useStore = create<{
researchPlanIds: Map<string, string>;
researchReportIds: Map<string, string>;
researchActivityIds: Map<string, string[]>;
researchQueries: Map<string, string>;
ongoingResearchId: string | null;
openResearchId: string | null;
@@ -42,6 +43,7 @@ export const useStore = create<{
researchPlanIds: new Map<string, string>(),
researchReportIds: new Map<string, string>(),
researchActivityIds: new Map<string, string[]>(),
researchQueries: new Map<string, string>(),
ongoingResearchId: null,
openResearchId: null,
@@ -267,11 +269,17 @@ function getOngoingResearchId() {
function appendResearch(researchId: string) {
let planMessage: Message | undefined;
let userQuery: string | undefined;
const reversedMessageIds = [...useStore.getState().messageIds].reverse();
for (const messageId of reversedMessageIds) {
const message = getMessage(messageId);
if (message?.agent === "planner") {
if (!planMessage && message?.agent === "planner") {
planMessage = message;
}
if (!userQuery && message?.role === "user") {
userQuery = message.content;
}
if (planMessage && userQuery) {
break;
}
}
@@ -288,6 +296,10 @@ function appendResearch(researchId: string) {
researchId,
messageIds,
),
researchQueries: new Map(useStore.getState().researchQueries).set(
researchId,
userQuery ?? "",
),
});
}
@@ -394,6 +406,10 @@ export function useResearchMessage(researchId: string) {
);
}
/**
 * Look up the original user query recorded for a research session.
 * Returns an empty string when no query was recorded for the researchId.
 */
export function getResearchQuery(researchId: string): string {
  return useStore.getState().researchQueries.get(researchId) ?? "";
}
export function useMessage(messageId: string | null | undefined) {
return useStore(
useShallow((state) =>