From b7f0f54aa0243e28563c160ab7eb3ebfbb11a339 Mon Sep 17 00:00:00 2001 From: LoftyComet <72645707+LoftyComet@users.noreply.github.com> Date: Sat, 24 Jan 2026 17:49:13 +0800 Subject: [PATCH] feat: add citation support in research report block and markdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add citation support in research report block and markdown - Enhanced ResearchReportBlock to fetch citations based on researchId and pass them to the Markdown component. - Introduced CitationLink component to display citation metadata on hover for links in markdown. - Implemented CitationCard and CitationList components for displaying citation details and lists. - Updated Markdown component to handle citation links and inline citations. - Created HoverCard component for displaying citation information in a tooltip-like manner. - Modified store to manage citations, including setting and retrieving citations for ongoing research. - Added CitationsEvent type to handle citations in chat events and updated Message type to include citations. * fix(log): Enable the logging level when enabling the DEBUG environment variable (#793) * fix(frontend): render all tool calls in the frontend #796 (#797) * build(deps): bump jspdf from 3.0.4 to 4.0.0 in /web (#798) Bumps [jspdf](https://github.com/parallax/jsPDF) from 3.0.4 to 4.0.0. - [Release notes](https://github.com/parallax/jsPDF/releases) - [Changelog](https://github.com/parallax/jsPDF/blob/master/RELEASE.md) - [Commits](https://github.com/parallax/jsPDF/compare/v3.0.4...v4.0.0) --- updated-dependencies: - dependency-name: jspdf dependency-version: 4.0.0 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * fix(frontend):added the display of the 'analyst' message #800 (#801) * fix: migrate from deprecated create_react_agent to langchain.agents.create_agent (#802) * fix: migrate from deprecated create_react_agent to langchain.agents.create_agent Fixes #799 - Replace deprecated langgraph.prebuilt.create_react_agent with langchain.agents.create_agent (LangGraph 1.0 migration) - Add DynamicPromptMiddleware to handle dynamic prompt templates (replaces the 'prompt' callable parameter) - Add PreModelHookMiddleware to handle pre-model hooks (replaces the 'pre_model_hook' parameter) - Update AgentState import from langchain.agents in template.py - Update tests to use the new API * fix:update the code with review comments * fix: Add runtime parameter to compress_messages method(#803) * fix: Add runtime parameter to compress_messages method(#803) The compress_messages method was being called by PreModelHookMiddleware with both state and runtime parameters, but only accepted state parameter. This caused a TypeError when the middleware executed the pre_model_hook. Added optional runtime parameter to compress_messages signature to match the expected interface while maintaining backward compatibility. 
* Update the code with the review comments * fix: Refactor citation handling and add comprehensive tests for citation features * refactor: Clean up imports and formatting across citation modules * fix: Add monkeypatch to clear AGENT_RECURSION_LIMIT in recursion limit tests * feat: Enhance citation link handling in Markdown component * fix: Exclude citations from finish reason handling in mergeMessage function * fix(nodes): update message handling * fix(citations): improve citation extraction and handling in event processing * feat(citations): enhance citation extraction and handling with improved merging and normalization * fix(reporter): update citation formatting instructions for clarity and consistency * fix(reporter): prioritize using Markdown tables for data presentation and comparison --------- Signed-off-by: dependabot[bot] Co-authored-by: LoftyComet <1277173875@qq.com> Co-authored-by: Willem Jiang Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- src/citations/__init__.py | 28 ++ src/citations/collector.py | 280 ++++++++++++++ src/citations/extractor.py | 343 ++++++++++++++++++ src/citations/formatter.py | 271 ++++++++++++++ src/citations/models.py | 178 +++++++++ src/graph/nodes.py | 52 ++- src/graph/types.py | 5 + src/prompts/reporter.md | 9 +- src/prompts/reporter.zh_CN.md | 9 +- src/server/app.py | 108 ++++++ tests/unit/citations/test_citations.py | 136 +++++++ tests/unit/config/test_configuration.py | 7 +- web/package.json | 1 + web/pnpm-lock.yaml | 201 ++++++++++ .../chat/components/research-report-block.tsx | 6 +- web/src/components/deer-flow/citation.tsx | 308 ++++++++++++++++ web/src/components/deer-flow/markdown.tsx | 120 +++++- web/src/components/ui/hover-card.tsx | 34 ++ web/src/core/api/types.ts | 13 +- web/src/core/messages/merge-message.ts | 2 +- web/src/core/messages/types.ts | 12 + web/src/core/store/store.ts | 31 +- 22 files changed, 2125 insertions(+), 29 deletions(-) create mode 100644 
src/citations/__init__.py create mode 100644 src/citations/collector.py create mode 100644 src/citations/extractor.py create mode 100644 src/citations/formatter.py create mode 100644 src/citations/models.py create mode 100644 tests/unit/citations/test_citations.py create mode 100644 web/src/components/deer-flow/citation.tsx create mode 100644 web/src/components/ui/hover-card.tsx diff --git a/src/citations/__init__.py b/src/citations/__init__.py new file mode 100644 index 0000000..c70df25 --- /dev/null +++ b/src/citations/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# SPDX-License-Identifier: MIT + +""" +Citation management module for DeerFlow. + +This module provides structured citation/source metadata handling +for research reports, enabling proper attribution and inline citations. +""" + +from .collector import CitationCollector +from .extractor import ( + citations_to_markdown_references, + extract_citations_from_messages, + merge_citations, +) +from .formatter import CitationFormatter +from .models import Citation, CitationMetadata + +__all__ = [ + "Citation", + "CitationMetadata", + "CitationCollector", + "CitationFormatter", + "extract_citations_from_messages", + "merge_citations", + "citations_to_markdown_references", +] diff --git a/src/citations/collector.py b/src/citations/collector.py new file mode 100644 index 0000000..db63702 --- /dev/null +++ b/src/citations/collector.py @@ -0,0 +1,280 @@ +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# SPDX-License-Identifier: MIT + +""" +Citation collector for gathering and managing citations during research. +""" + +import logging +from typing import Any, Dict, List, Optional + +from .models import Citation, CitationMetadata + +logger = logging.getLogger(__name__) + + +class CitationCollector: + """ + Collects and manages citations during the research process. 
+ + This class handles: + - Collecting citations from search results and crawled pages + - Deduplicating citations by URL + - Assigning citation numbers + - Tracking which citations are actually used in the report + """ + + def __init__(self): + self._citations: Dict[str, CitationMetadata] = {} # url -> metadata + self._citation_order: List[str] = [] # ordered list of URLs + self._used_citations: set[str] = set() # URLs that are actually cited + + def add_from_search_results( + self, results: List[Dict[str, Any]], query: str = "" + ) -> List[CitationMetadata]: + """ + Add citations from search results. + + Args: + results: List of search result dictionaries + query: The search query that produced these results + + Returns: + List of CitationMetadata objects that were added + """ + added = [] + for result in results: + # Skip image results + if result.get("type") == "image_url": + continue + + url = result.get("url") + if not url: + continue + + # Create or update citation metadata + metadata = CitationMetadata.from_search_result(result, query) + + if url not in self._citations: + self._citations[url] = metadata + self._citation_order.append(url) + added.append(metadata) + logger.debug(f"Added citation: {metadata.title} ({url})") + else: + # Update with potentially better metadata + existing = self._citations[url] + if metadata.relevance_score > existing.relevance_score: + self._citations[url] = metadata + logger.debug(f"Updated citation: {metadata.title} ({url})") + + return added + + def add_from_crawl_result( + self, url: str, title: str, content: Optional[str] = None, **extra_metadata + ) -> CitationMetadata: + """ + Add or update a citation from a crawled page. 
+ + Args: + url: The URL of the crawled page + title: The page title + content: The page content + **extra_metadata: Additional metadata fields + + Returns: + The CitationMetadata object + """ + if url in self._citations: + # Update existing citation with crawled content + metadata = self._citations[url] + if title and title != "Untitled": + metadata.title = title + if content: + metadata.raw_content = content + if not metadata.content_snippet: + metadata.content_snippet = content[:500] + else: + # Create new citation + metadata = CitationMetadata( + url=url, + title=title or "Untitled", + content_snippet=content[:500] if content else None, + raw_content=content, + **extra_metadata, + ) + self._citations[url] = metadata + self._citation_order.append(url) + + return metadata + + def mark_used(self, url: str) -> Optional[int]: + """ + Mark a citation as used and return its number. + + Args: + url: The URL of the citation + + Returns: + The citation number (1-indexed) or None if not found + """ + if url in self._citations: + self._used_citations.add(url) + return self.get_number(url) + return None + + def get_number(self, url: str) -> Optional[int]: + """ + Get the citation number for a URL. + + Args: + url: The URL to look up + + Returns: + The citation number (1-indexed) or None if not found + """ + try: + return self._citation_order.index(url) + 1 + except ValueError: + return None + + def get_metadata(self, url: str) -> Optional[CitationMetadata]: + """ + Get the metadata for a URL. + + Args: + url: The URL to look up + + Returns: + The CitationMetadata or None if not found + """ + return self._citations.get(url) + + def get_all_citations(self) -> List[Citation]: + """ + Get all collected citations in order. 
+ + Returns: + List of Citation objects + """ + citations = [] + for i, url in enumerate(self._citation_order): + metadata = self._citations[url] + citations.append( + Citation( + number=i + 1, + metadata=metadata, + ) + ) + return citations + + def get_used_citations(self) -> List[Citation]: + """ + Get only the citations that have been marked as used. + + Returns: + List of Citation objects that are actually used + """ + citations = [] + number = 1 + for url in self._citation_order: + if url in self._used_citations: + metadata = self._citations[url] + citations.append( + Citation( + number=number, + metadata=metadata, + ) + ) + number += 1 + return citations + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize the collector state to a dictionary. + + Returns: + Dictionary representation of the collector + """ + return { + "citations": [c.to_dict() for c in self.get_all_citations()], + "used_urls": list(self._used_citations), + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "CitationCollector": + """ + Deserialize a collector from a dictionary. + + Args: + data: Dictionary representation + + Returns: + CitationCollector instance + """ + collector = cls() + for citation_data in data.get("citations", []): + citation = Citation.from_dict(citation_data) + collector._citations[citation.url] = citation.metadata + collector._citation_order.append(citation.url) + collector._used_citations = set(data.get("used_urls", [])) + return collector + + def merge_with(self, other: "CitationCollector") -> None: + """ + Merge another collector's citations into this one. 
+ + Args: + other: Another CitationCollector to merge + """ + for url in other._citation_order: + if url not in self._citations: + self._citations[url] = other._citations[url] + self._citation_order.append(url) + self._used_citations.update(other._used_citations) + + @property + def count(self) -> int: + """Return the total number of citations.""" + return len(self._citations) + + @property + def used_count(self) -> int: + """Return the number of used citations.""" + return len(self._used_citations) + + def clear(self) -> None: + """Clear all citations.""" + self._citations.clear() + self._citation_order.clear() + self._used_citations.clear() + + +def extract_urls_from_text(text: str) -> List[str]: + """ + Extract URLs from markdown text. + + Args: + text: Markdown text that may contain URLs + + Returns: + List of URLs found in the text + """ + import re + + urls = [] + + # Match markdown links: [text](url) + markdown_pattern = r"\[([^\]]+)\]\(([^)]+)\)" + for match in re.finditer(markdown_pattern, text): + url = match.group(2) + if url.startswith(("http://", "https://")): + urls.append(url) + + # Match bare URLs + bare_url_pattern = r"(?<!\()(https?://[^\s<>\]]+)" + for match in re.finditer(bare_url_pattern, text): + url = match.group(1) + if url not in urls: + urls.append(url) + + return urls diff --git a/src/citations/extractor.py b/src/citations/extractor.py new file mode 100644 index 0000000..477b1bc --- /dev/null +++ b/src/citations/extractor.py @@ -0,0 +1,343 @@ +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# SPDX-License-Identifier: MIT + +""" +Citation extraction utilities for extracting citations from tool results. 
+""" + +import json +import logging +from typing import Any, Dict, List, Optional + +from langchain_core.messages import AIMessage, ToolMessage + +from .models import CitationMetadata + +logger = logging.getLogger(__name__) + + +def extract_citations_from_messages(messages: List[Any]) -> List[Dict[str, Any]]: + """ + Extract citation metadata from agent messages (tool calls/results). + + Args: + messages: List of messages from agent execution + + Returns: + List of citation dictionaries + """ + citations = [] + seen_urls = set() + + logger.info(f"[Citations] Starting extraction from {len(messages)} messages") + + for message in messages: + # Extract from ToolMessage results (web_search, crawl) + if isinstance(message, ToolMessage): + logger.info( + f"[Citations] Found ToolMessage: name={getattr(message, 'name', 'unknown')}" + ) + tool_citations = _extract_from_tool_message(message) + for citation in tool_citations: + url = citation.get("url", "") + if url and url not in seen_urls: + seen_urls.add(url) + citations.append(citation) + + # Also check AIMessage tool_calls for any embedded results + if isinstance(message, AIMessage) and hasattr(message, "tool_calls"): + for tool_call in message.tool_calls or []: + if tool_call.get("name") == "web_search": + # The query is in the args + query = tool_call.get("args", {}).get("query", "") + logger.info( + "[Citations] Found web_search tool call with query=%r", query + ) + # Note: results come in subsequent ToolMessage + + logger.info( + f"[Citations] Extracted {len(citations)} unique citations from {len(messages)} messages" + ) + return citations + + +def _extract_from_tool_message(message: ToolMessage) -> List[Dict[str, Any]]: + """ + Extract citations from a tool message result. 
+ + Args: + message: ToolMessage with tool execution result + + Returns: + List of citation dictionaries + """ + citations = [] + tool_name = getattr(message, "name", "") or "" + content = getattr(message, "content", "") + + logger.info( + f"Processing tool message: tool_name='{tool_name}', content_len={len(str(content)) if content else 0}" + ) + + if not content: + return citations + + # Parse JSON content + try: + if isinstance(content, str): + data = json.loads(content) + else: + data = content + except (json.JSONDecodeError, TypeError): + logger.debug( + f"Could not parse tool message content as JSON: {str(content)[:100]}..." + ) + return citations + + logger.debug(f"Parsed tool message data type: {type(data).__name__}") + + # Try to detect content type by structure rather than just tool name + tool_name_lower = tool_name.lower() if tool_name else "" + + # Handle web_search results (by name or by structure) + if tool_name_lower in ( + "web_search", + "tavily_search", + "duckduckgo_search", + "brave_search", + "searx_search", + ): + citations.extend(_extract_from_search_results(data)) + logger.debug( + f"Extracted {len(citations)} citations from search tool '{tool_name}'" + ) + + # Handle crawl results (by name or by structure) + elif tool_name_lower in ("crawl_tool", "crawl", "jina_crawl"): + citation = _extract_from_crawl_result(data) + if citation: + citations.append(citation) + logger.debug(f"Extracted 1 citation from crawl tool '{tool_name}'") + + # Fallback: Try to detect by data structure + else: + # Check if it looks like search results (list of items with url) + if isinstance(data, list) and len(data) > 0: + first_item = data[0] + if isinstance(first_item, dict) and "url" in first_item: + logger.debug( + f"Auto-detected search results format for tool '{tool_name}'" + ) + citations.extend(_extract_from_search_results(data)) + # Check if it looks like crawl result (dict with url and crawled_content) + elif ( + isinstance(data, dict) + and "url" in data + 
and ("crawled_content" in data or "content" in data) + ): + logger.debug(f"Auto-detected crawl result format for tool '{tool_name}'") + citation = _extract_from_crawl_result(data) + if citation: + citations.append(citation) + + return citations + + +def _extract_from_search_results(data: Any) -> List[Dict[str, Any]]: + """ + Extract citations from web search results. + + Args: + data: Parsed JSON data from search tool + + Returns: + List of citation dictionaries + """ + citations = [] + + # Handle list of results + if isinstance(data, list): + for result in data: + if isinstance(result, dict) and result.get("type") != "image_url": + citation = _result_to_citation(result) + if citation: + citations.append(citation) + + # Handle dict with results key + elif isinstance(data, dict): + if "error" in data: + logger.warning(f"Search error: {data.get('error')}") + return citations + + results = data.get("results", []) + for result in results: + if isinstance(result, dict) and result.get("type") != "image_url": + citation = _result_to_citation(result) + if citation: + citations.append(citation) + + return citations + + +def _result_to_citation(result: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """ + Convert a search result to a citation dictionary. + + Args: + result: Search result dictionary + + Returns: + Citation dictionary or None + """ + url = result.get("url", "") + if not url: + return None + + return { + "url": url, + "title": result.get("title", "Untitled"), + "description": result.get("content", ""), + "content_snippet": (result.get("content", "") or "")[:500], + "relevance_score": result.get("score", 0.0), + "domain": _extract_domain(url), + "accessed_at": None, # Will be filled by CitationMetadata + "source_type": "web_search", + } + + +def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]: + """ + Extract citation from crawl tool result. 
+ + Args: + data: Parsed JSON data from crawl tool + + Returns: + Citation dictionary or None + """ + if not isinstance(data, dict): + return None + + url = data.get("url", "") + if not url: + return None + + content = data.get("crawled_content", "") + + # Try to extract title from content (first h1 or first line) + title = "Untitled" + if content: + lines = content.strip().split("\n") + for line in lines: + line = line.strip() + if line.startswith("# "): + title = line[2:].strip() + break + elif line and not line.startswith("#"): + title = line[:100] + break + + return { + "url": url, + "title": title, + "description": content[:300] if content else "", + "content_snippet": content[:500] if content else "", + "raw_content": content, + "domain": _extract_domain(url), + "source_type": "crawl", + } + + +def _extract_domain(url: str) -> str: + """Extract domain from URL.""" + try: + from urllib.parse import urlparse + + parsed = urlparse(url) + return parsed.netloc + except Exception: + return "" + + +def merge_citations( + existing: List[Dict[str, Any]], new: List[Dict[str, Any]] +) -> List[Dict[str, Any]]: + """ + Merge new citations into existing list, avoiding duplicates. + + Args: + existing: Existing citations list + new: New citations to add + + Returns: + Merged list of citations + """ + seen_urls = {c.get("url") for c in existing if c.get("url")} + result = list(existing) + + for citation in new: + url = citation.get("url", "") + if url and url not in seen_urls: + seen_urls.add(url) + result.append(citation) + elif url in seen_urls: + # Update existing citation with potentially better data + for i, existing_citation in enumerate(result): + if existing_citation.get("url") == url: + # Prefer higher relevance score + if citation.get("relevance_score", 0) > existing_citation.get( + "relevance_score", 0 + ): + # Update selectively instead of blindly merging all fields. 
+ updated = existing_citation.copy() + # Always update relevance_score + if "relevance_score" in citation: + updated["relevance_score"] = citation["relevance_score"] + # Merge other metadata only if improved (here assuming non-empty is 'better') + for key in ("title", "description", "snippet"): + new_value = citation.get(key) + if new_value: + updated[key] = new_value + result[i] = updated + break + break + + return result + + +def citations_to_markdown_references(citations: List[Dict[str, Any]]) -> str: + """ + Convert citations list to markdown references section. + + Args: + citations: List of citation dictionaries + + Returns: + Markdown formatted references section + """ + if not citations: + return "" + + lines = ["## Key Citations", ""] + + for i, citation in enumerate(citations, 1): + title = citation.get("title", "Untitled") + url = citation.get("url", "") + domain = citation.get("domain", "") + + # Main reference link + lines.append(f"- [{title}]({url})") + + # Add metadata as comment for parsing + metadata_parts = [] + if domain: + metadata_parts.append(f"domain: {domain}") + if citation.get("relevance_score"): + metadata_parts.append(f"score: {citation['relevance_score']:.2f}") + + if metadata_parts: + lines.append(f"  <!-- {', '.join(metadata_parts)} -->") + + lines.append("") # Empty line between citations + + return "\n".join(lines) diff --git a/src/citations/formatter.py b/src/citations/formatter.py new file mode 100644 index 0000000..e751223 --- /dev/null +++ b/src/citations/formatter.py @@ -0,0 +1,271 @@ +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# SPDX-License-Identifier: MIT + +""" +Citation formatter for generating citation sections and inline references. +""" + +import re +from typing import Dict, List, Tuple + +from .models import Citation, CitationMetadata + + +class CitationFormatter: + """ + Formats citations for display in reports. + + Supports multiple citation styles: + - numbered: [1], [2], etc. + - superscript: ¹, ², etc. + - footnote: [^1], [^2], etc. 
+ - inline: (Author, Year) or (Source) + """ + + SUPERSCRIPT_MAP = { + "0": "⁰", + "1": "¹", + "2": "²", + "3": "³", + "4": "⁴", + "5": "⁵", + "6": "⁶", + "7": "⁷", + "8": "⁸", + "9": "⁹", + } + + def __init__(self, style: str = "numbered"): + """ + Initialize the formatter. + + Args: + style: Citation style ('numbered', 'superscript', 'footnote', 'inline') + """ + self.style = style + + def format_inline_marker(self, number: int) -> str: + """ + Format an inline citation marker. + + Args: + number: The citation number + + Returns: + Formatted marker string + """ + if self.style == "superscript": + return "".join(self.SUPERSCRIPT_MAP.get(c, c) for c in str(number)) + elif self.style == "footnote": + return f"[^{number}]" + else: # numbered + return f"[{number}]" + + def format_reference(self, citation: Citation) -> str: + """ + Format a single reference for the citations section. + + Args: + citation: The citation to format + + Returns: + Formatted reference string + """ + metadata = citation.metadata + + # Build reference with available metadata + parts = [] + + # Number and title + parts.append(f"[{citation.number}] **{metadata.title}**") + + # Author if available + if metadata.author: + parts.append(f" *{metadata.author}*") + + # Domain/source + if metadata.domain: + parts.append(f" Source: {metadata.domain}") + + # Published date if available + if metadata.published_date: + parts.append(f" Published: {metadata.published_date}") + + # URL + parts.append(f" URL: {metadata.url}") + + # Description/snippet + if metadata.description: + snippet = metadata.description[:200] + if len(metadata.description) > 200: + snippet += "..." + parts.append(f" > {snippet}") + + return "\n".join(parts) + + def format_simple_reference(self, citation: Citation) -> str: + """ + Format a simple reference (title + URL). 
+ + Args: + citation: The citation to format + + Returns: + Simple reference string + """ + return f"- [{citation.metadata.title}]({citation.metadata.url})" + + def format_rich_reference(self, citation: Citation) -> str: + """ + Format a rich reference with metadata as JSON-like annotation. + + Args: + citation: The citation to format + + Returns: + Rich reference string with metadata + """ + metadata = citation.metadata + parts = [f"- [{metadata.title}]({metadata.url})"] + + annotations = [] + if metadata.domain: + annotations.append(f"domain: {metadata.domain}") + if metadata.relevance_score > 0: + annotations.append(f"relevance: {metadata.relevance_score:.2f}") + if metadata.accessed_at: + annotations.append(f"accessed: {metadata.accessed_at[:10]}") + + if annotations: + parts.append(f"  <!-- {', '.join(annotations)} -->") + + return "\n".join(parts) + + def format_citations_section( + self, citations: List[Citation], include_metadata: bool = True + ) -> str: + """ + Format the full citations section for a report. + + Args: + citations: List of citations to include + include_metadata: Whether to include rich metadata + + Returns: + Formatted citations section markdown + """ + if not citations: + return "" + + lines = ["## Key Citations", ""] + + for citation in citations: + if include_metadata: + lines.append(self.format_rich_reference(citation)) + else: + lines.append(self.format_simple_reference(citation)) + lines.append("") # Empty line between citations + + return "\n".join(lines) + + def format_footnotes_section(self, citations: List[Citation]) -> str: + """ + Format citations as footnotes (for footnote style). 
+ + Args: + citations: List of citations + + Returns: + Footnotes section markdown + """ + if not citations: + return "" + + lines = ["", "---", ""] + for citation in citations: + lines.append( + f"[^{citation.number}]: {citation.metadata.title} - {citation.metadata.url}" + ) + + return "\n".join(lines) + + def add_citation_markers_to_text( + self, text: str, citations: List[Citation], url_to_number: Dict[str, int] + ) -> str: + """ + Add citation markers to text where URLs are referenced. + + Args: + text: The text to process + citations: Available citations + url_to_number: Mapping from URL to citation number + + Returns: + Text with citation markers added + """ + + # Find all markdown links and add citation numbers + def replace_link(match): + full_match = match.group(0) + url = match.group(2) + + if url in url_to_number: + number = url_to_number[url] + marker = self.format_inline_marker(number) + return f"{full_match}{marker}" + return full_match + + pattern = r"\[([^\]]+)\]\(([^)]+)\)" + return re.sub(pattern, replace_link, text) + + @staticmethod + def build_citation_data_json(citations: List[Citation]) -> str: + """ + Build a JSON block containing citation data for frontend use. + + Args: + citations: List of citations + + Returns: + JSON string with citation data + """ + import json + + data = { + "citations": [c.to_dict() for c in citations], + "count": len(citations), + } + + return json.dumps(data, ensure_ascii=False) + + +def parse_citations_from_report(report: str) -> List[Tuple[str, str]]: + """ + Parse citation links from a report's Key Citations section. 
+ + Args: + report: The report markdown text + + Returns: + List of (title, url) tuples + """ + citations = [] + + # Find the Key Citations section + section_pattern = ( + r"(?:##\s*Key Citations|##\s*References|##\s*Sources)\s*\n(.*?)(?=\n##|\Z)" + ) + section_match = re.search(section_pattern, report, re.IGNORECASE | re.DOTALL) + + if section_match: + section = section_match.group(1) + + # Extract markdown links + link_pattern = r"\[([^\]]+)\]\(([^)]+)\)" + for match in re.finditer(link_pattern, section): + title = match.group(1) + url = match.group(2) + if url.startswith(("http://", "https://")): + citations.append((title, url)) + + return citations diff --git a/src/citations/models.py b/src/citations/models.py new file mode 100644 index 0000000..dec9707 --- /dev/null +++ b/src/citations/models.py @@ -0,0 +1,178 @@ +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# SPDX-License-Identifier: MIT + +""" +Citation data models for structured source metadata. +""" + +import hashlib +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any, Dict, List, Optional +from urllib.parse import urlparse + + +@dataclass +class CitationMetadata: + """Metadata extracted from a source.""" + + # Core identifiers + url: str + title: str + + # Content information + description: Optional[str] = None + content_snippet: Optional[str] = None + raw_content: Optional[str] = None + + # Source metadata + domain: Optional[str] = None + author: Optional[str] = None + published_date: Optional[str] = None + language: Optional[str] = None + + # Media + images: List[str] = field(default_factory=list) + favicon: Optional[str] = None + + # Quality indicators + relevance_score: float = 0.0 + credibility_score: float = 0.0 + + # Timestamps + accessed_at: str = field(default_factory=lambda: datetime.now().isoformat()) + + # Additional metadata + extra: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self): + """Extract domain from URL 
if not provided.""" + if not self.domain and self.url: + try: + parsed = urlparse(self.url) + self.domain = parsed.netloc + except Exception: + # If URL parsing fails for any reason, leave `domain` as None. + # This is a non-critical convenience field and failures here + # should not prevent citation metadata creation. + pass + + @property + def id(self) -> str: + """Generate a unique ID for this citation based on URL.""" + return hashlib.sha256(self.url.encode("utf-8")).hexdigest()[:12] + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for JSON serialization.""" + return { + "id": self.id, + "url": self.url, + "title": self.title, + "description": self.description, + "content_snippet": self.content_snippet, + "domain": self.domain, + "author": self.author, + "published_date": self.published_date, + "language": self.language, + "images": self.images, + "favicon": self.favicon, + "relevance_score": self.relevance_score, + "credibility_score": self.credibility_score, + "accessed_at": self.accessed_at, + "extra": self.extra, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "CitationMetadata": + """Create from dictionary.""" + # Remove 'id' as it's computed from url + data = {k: v for k, v in data.items() if k != "id"} + return cls(**data) + + @classmethod + def from_search_result( + cls, result: Dict[str, Any], query: str = "" + ) -> "CitationMetadata": + """Create citation metadata from a search result.""" + return cls( + url=result.get("url", ""), + title=result.get("title", "Untitled"), + description=result.get("content", result.get("description", "")), + content_snippet=result.get("content", "")[:500] + if result.get("content") + else None, + raw_content=result.get("raw_content"), + relevance_score=result.get("score", 0.0), + extra={"query": query, "result_type": result.get("type", "page")}, + ) + + +@dataclass +class Citation: + """ + A citation reference that can be used in reports. 
+ + This represents a numbered citation that links to source metadata. + """ + + # Citation number (1-indexed for display) + number: int + + # Reference to the source metadata + metadata: CitationMetadata + + # Context where this citation is used + context: Optional[str] = None + + # Specific quote or fact being cited + cited_text: Optional[str] = None + + @property + def id(self) -> str: + """Get the citation ID from metadata.""" + return self.metadata.id + + @property + def url(self) -> str: + """Get the URL from metadata.""" + return self.metadata.url + + @property + def title(self) -> str: + """Get the title from metadata.""" + return self.metadata.title + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for JSON serialization.""" + return { + "number": self.number, + "metadata": self.metadata.to_dict(), + "context": self.context, + "cited_text": self.cited_text, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "Citation": + """Create from dictionary.""" + return cls( + number=data["number"], + metadata=CitationMetadata.from_dict(data["metadata"]), + context=data.get("context"), + cited_text=data.get("cited_text"), + ) + + def to_markdown_reference(self) -> str: + """Generate markdown reference format: [Title](URL)""" + return f"[{self.title}]({self.url})" + + def to_numbered_reference(self) -> str: + """Generate numbered reference format: [1] Title - URL""" + return f"[{self.number}] {self.title} - {self.url}" + + def to_inline_marker(self) -> str: + """Generate inline citation marker: [^1]""" + return f"[^{self.number}]" + + def to_footnote(self) -> str: + """Generate footnote definition: [^1]: Title - URL""" + return f"[^{self.number}]: {self.title} - {self.url}" diff --git a/src/graph/nodes.py b/src/graph/nodes.py index 3f2e47c..69ef232 100644 --- a/src/graph/nodes.py +++ b/src/graph/nodes.py @@ -14,6 +14,7 @@ from langchain_mcp_adapters.client import MultiServerMCPClient from langgraph.types import Command, interrupt from 
src.agents import create_agent +from src.citations import extract_citations_from_messages, merge_citations from src.config.agents import AGENT_LLM_MAP from src.config.configuration import Configuration from src.llms.llm import get_llm_by_type, get_llm_token_limit_by_type @@ -715,6 +716,7 @@ def coordinator_node( "clarified_research_topic": clarified_topic, "is_clarification_complete": False, "goto": goto, + "citations": state.get("citations", []), "__interrupt__": [("coordinator", response.content)], }, goto=goto, @@ -802,6 +804,7 @@ def coordinator_node( "clarification_history": clarification_history, "is_clarification_complete": goto != "coordinator", "goto": goto, + "citations": state.get("citations", []), }, goto=goto, ) @@ -822,14 +825,32 @@ def reporter_node(state: State, config: RunnableConfig): } invoke_messages = apply_prompt_template("reporter", input_, configurable, input_.get("locale", "en-US")) observations = state.get("observations", []) + + # Get collected citations for the report + citations = state.get("citations", []) - # Add a reminder about the new report format, citation style, and table usage - invoke_messages.append( - HumanMessage( - content="IMPORTANT: Structure your report according to the format in the prompt. Remember to include:\n\n1. Key Points - A bulleted list of the most important findings\n2. Overview - A brief introduction to the topic\n3. Detailed Analysis - Organized into logical sections\n4. Survey Note (optional) - For more comprehensive reports\n5. Key Citations - List all references at the end\n\nFor citations, DO NOT include inline citations in the text. Instead, place all citations in the 'Key Citations' section at the end using the format: `- [Source Title](URL)`. Include an empty line between each citation for better readability.\n\nPRIORITIZE USING MARKDOWN TABLES for data presentation and comparison. Use tables whenever presenting comparative data, statistics, features, or options. 
Structure tables with clear headers and aligned columns. Example table format:\n\n| Feature | Description | Pros | Cons |\n|---------|-------------|------|------|\n| Feature 1 | Description 1 | Pros 1 | Cons 1 |\n| Feature 2 | Description 2 | Pros 2 | Cons 2 |", - name="system", + # If we have collected citations, provide them to the reporter + if citations: + citation_list = "\n\n## Available Source References (use these in References section):\n\n" + for i, citation in enumerate(citations, 1): + title = citation.get("title", "Untitled") + url = citation.get("url", "") + domain = citation.get("domain", "") + description = citation.get("description", "") + desc_truncated = description[:150] if description else "" + citation_list += f"{i}. **{title}**\n - URL: {url}\n - Domain: {domain}\n" + if desc_truncated: + citation_list += f" - Summary: {desc_truncated}...\n" + citation_list += "\n" + + logger.info(f"Providing {len(citations)} collected citations to reporter") + + invoke_messages.append( + HumanMessage( + content=citation_list, + name="system", + ) ) - ) observation_messages = [] for observation in observations: @@ -852,7 +873,10 @@ def reporter_node(state: State, config: RunnableConfig): response_content = response.content logger.info(f"reporter response: {response_content}") - return {"final_report": response_content} + return { + "final_report": response_content, + "citations": citations, # Pass citations through to final state + } def research_team_node(state: State): @@ -1114,11 +1138,23 @@ async def _execute_agent_step( f"All tool results will be preserved and streamed to frontend." ) + # Extract citations from tool call results (web_search, crawl) + existing_citations = state.get("citations", []) + new_citations = extract_citations_from_messages(agent_messages) + merged_citations = merge_citations(existing_citations, new_citations) + + if new_citations: + logger.info( + f"Extracted {len(new_citations)} new citations from {agent_name} agent. 
" + f"Total citations: {len(merged_citations)}" + ) + return Command( update={ + **preserve_state_meta_fields(state), "messages": agent_messages, "observations": observations + [response_content + validation_info], - **preserve_state_meta_fields(state), + "citations": merged_citations, # Store merged citations based on existing state and new tool results }, goto="research_team", ) diff --git a/src/graph/types.py b/src/graph/types.py index a977de7..64abf4e 100644 --- a/src/graph/types.py +++ b/src/graph/types.py @@ -3,6 +3,7 @@ from dataclasses import field +from typing import Any from langgraph.graph import MessagesState @@ -27,6 +28,10 @@ class State(MessagesState): auto_accepted_plan: bool = False enable_background_investigation: bool = True background_investigation_results: str = None + + # Citation metadata collected during research + # Format: List of citation dictionaries with url, title, description, etc. + citations: list[dict[str, Any]] = field(default_factory=list) # Clarification state tracking (disabled by default) enable_clarification: bool = ( diff --git a/src/prompts/reporter.md b/src/prompts/reporter.md index 3146f28..69b929d 100644 --- a/src/prompts/reporter.md +++ b/src/prompts/reporter.md @@ -372,9 +372,12 @@ Structure your report in the following format: - If uncertain about any information, acknowledge the uncertainty. - Only include verifiable facts from the provided source material. -- Place all citations in the "Key Citations" section at the end, not inline in the text. -- For each citation, use the format: `- [Source Title](URL)` -- Include an empty line between each citation for better readability. +- Structure your report to include: Key Points, Overview, Detailed Analysis, Survey Note (optional), and References. +- Use inline citations [n] in the text where appropriate. +- The number n must correspond to the source index in the provided 'Available Source References' list. 
+- Make the inline citation a link to the reference at the bottom using the format `[[n]](#ref-n)`. +- In the References section at the end, list the sources using the format `[[n]](#citation-target-n) **[Title](URL)**`. +- PRIORITIZE USING MARKDOWN TABLES for data presentation and comparison. Use tables whenever presenting comparative data, statistics, features, or options. - Include images using `![Image Description](image_url)`. The images should be in the middle of the report, not at the end or separate section. - The included images should **only** be from the information gathered **from the previous steps**. **Never** include images that are not from the previous steps - Directly output the Markdown raw content without "```markdown" or "```". diff --git a/src/prompts/reporter.zh_CN.md b/src/prompts/reporter.zh_CN.md index 87d47cf..4a528d9 100644 --- a/src/prompts/reporter.zh_CN.md +++ b/src/prompts/reporter.zh_CN.md @@ -370,9 +370,12 @@ CURRENT_TIME: {{ CURRENT_TIME }} - 如果对任何信息不确定,确认不确定性。 - 仅包括来自提供的源资料的可验证事实。 -- 将所有引用放在末尾的"关键引文"部分,而不是文本中的内联。 -- 对于每个引用,使用格式:`- [来源标题](URL)` -- 在每个引文之间包括一个空行以获得更好的可读性。 +- 报告结构应包含:核心要点、概述、详细分析、调查说明(可选)和参考文献。 +- 在正文适当位置使用内联引用 [n]。 +- 数字 n 必须对应提供的"可用来源参考"列表中的索引。 +- 将内联引用设为指向底部参考文献的链接,格式为 `[[n]](#ref-n)`。 +- 在末尾的参考文献部分,使用格式 `[[n]](#citation-target-n) **[标题](URL)**` 列出来源。 +- 优先使用 Markdown 表格进行数据展示和比较。在展示对比数据、统计数据、特性或选项时,请务必使用表格。 - 使用`![图像说明](图像URL)`包括图像。图像应该在报告的中间,而不是末尾或单独的部分。 - 包含的图像应**仅**来自**从之前步骤中**收集的信息。**绝不**包括不来自之前步骤的图像 - 直接输出Markdown原始内容,不带"```markdown"或"```"。 diff --git a/src/server/app.py b/src/server/app.py index d473ec5..078a935 100644 --- a/src/server/app.py +++ b/src/server/app.py @@ -37,6 +37,7 @@ from src.config.configuration import get_recursion_limit from src.config.loader import get_bool_env, get_int_env, get_str_env from src.config.report_style import ReportStyle from src.config.tools import SELECTED_RAG_PROVIDER +from src.citations import merge_citations from src.graph.builder import build_graph_with_memory from 
src.graph.checkpoint import chat_stream_message from src.graph.utils import ( @@ -584,14 +585,69 @@ async def _process_message_chunk(message_chunk, message_metadata, thread_id, age yield _make_event("message_chunk", event_stream_message) +def extract_citations_from_event(event: Any, safe_thread_id: str = "unknown") -> list: + """Extract all citations from event data using an iterative, depth-limited traversal.""" + # Only dict-based event structures are supported + if not isinstance(event, dict): + return [] + + from collections import deque + citations: list[Any] = [] + max_depth = 5 # Prevent excessively deep traversal + max_nodes = 5000 # Safety cap to avoid pathological large structures + + # Queue holds (node_dict, depth) for BFS traversal + queue: deque[tuple[dict[str, Any], int]] = deque([(event, 0)]) + nodes_visited = 0 + + while queue: + current, depth = queue.popleft() + nodes_visited += 1 + if nodes_visited > max_nodes: + logger.warning( + f"[{safe_thread_id}] Stopping citation extraction after visiting " + f"{nodes_visited} nodes to avoid performance issues" + ) + break + + # Direct citations field at this level + direct_citations = current.get("citations") + if isinstance(direct_citations, list) and direct_citations: + logger.debug( + f"[{safe_thread_id}] Found {len(direct_citations)} citations at depth {depth}" + ) + citations.extend(direct_citations) + + # Do not traverse deeper than max_depth + if depth >= max_depth: + continue + + # Check nested values (for updates mode) + for value in current.values(): + if isinstance(value, dict): + queue.append((value, depth + 1)) + # Also check if the value is a list of dicts (like Command updates) + elif isinstance(value, list): + for item in value: + if isinstance(item, dict): + queue.append((item, depth + 1)) + return citations + + async def _stream_graph_events( graph_instance, workflow_input, workflow_config, thread_id ): """Stream events from the graph and process them.""" safe_thread_id = 
sanitize_thread_id(thread_id) logger.debug(f"[{safe_thread_id}] Starting graph event stream with agent nodes") + + # Track citations collected during research + collected_citations = [] + try: event_count = 0 + last_state_update = None # Track the last state update to get final citations + async for agent, _, event_data in graph_instance.astream( workflow_input, config=workflow_config, @@ -603,6 +659,24 @@ async def _stream_graph_events( logger.debug(f"[{safe_thread_id}] Graph event #{event_count} received from agent: {safe_agent}") if isinstance(event_data, dict): + # Store the last state update for final citation extraction + last_state_update = event_data + + # Log event keys for debugging (more verbose for citations debugging) + event_keys = list(event_data.keys()) + + # Check for citations in state updates (may be nested) + new_citations = extract_citations_from_event(event_data, safe_thread_id) + if new_citations: + # Accumulate citations across events instead of overwriting + # using merge_citations to avoid duplicates and preserve better metadata + collected_citations = merge_citations(collected_citations, new_citations) + # Key difference: replace string heuristic with actual extraction count for logging + logger.info( + f"[{safe_thread_id}] Event contains citations, " + f"keys: {event_keys}, count: {len(new_citations)}, total: {len(collected_citations)}" + ) + if "__interrupt__" in event_data: logger.debug( f"[{safe_thread_id}] Processing interrupt event: " @@ -631,6 +705,40 @@ async def _stream_graph_events( ): yield event + # After streaming completes, try to get citations + # First check if we collected any during streaming + if not collected_citations and last_state_update: + # Try to get citations from the last state update + logger.debug(f"[{safe_thread_id}] No citations collected during streaming, checking last state update") + collected_citations = extract_citations_from_event(last_state_update, safe_thread_id) + + # If still no citations, try to 
get from graph state directly + if not collected_citations: + try: + # Get the current state from the graph using proper config + state_config = {"configurable": {"thread_id": thread_id}} + current_state = await graph_instance.aget_state(state_config) + if current_state and hasattr(current_state, 'values'): + state_values = current_state.values + if isinstance(state_values, dict) and 'citations' in state_values: + collected_citations = state_values.get('citations', []) + logger.info(f"[{safe_thread_id}] Retrieved {len(collected_citations)} citations from final graph state") + except Exception as e: + logger.warning( + f"[{safe_thread_id}] Could not retrieve citations from graph state: {e}", + exc_info=True, + ) + + # Send collected citations as a separate event + if collected_citations: + logger.info(f"[{safe_thread_id}] Sending {len(collected_citations)} citations to client") + yield _make_event("citations", { + "thread_id": thread_id, + "citations": collected_citations, + }) + else: + logger.debug(f"[{safe_thread_id}] No citations to send") + logger.debug(f"[{safe_thread_id}] Graph event stream completed. Total events: {event_count}") except asyncio.CancelledError: # User cancelled/interrupted the stream - this is normal, not an error diff --git a/tests/unit/citations/test_citations.py b/tests/unit/citations/test_citations.py new file mode 100644 index 0000000..89dae5a --- /dev/null +++ b/tests/unit/citations/test_citations.py @@ -0,0 +1,136 @@ +# Copyright (c) 2025 Bytedance Ltd. 
and/or its affiliates +# SPDX-License-Identifier: MIT + +from langchain_core.messages import ToolMessage + +from src.citations.collector import CitationCollector +from src.citations.extractor import ( + _extract_domain, + citations_to_markdown_references, + extract_citations_from_messages, + merge_citations, +) +from src.citations.formatter import CitationFormatter +from src.citations.models import Citation, CitationMetadata + + +class TestCitationMetadata: + def test_initialization(self): + meta = CitationMetadata( + url="https://example.com/page", + title="Example Page", + description="An example description", + ) + assert meta.url == "https://example.com/page" + assert meta.title == "Example Page" + assert meta.description == "An example description" + assert meta.domain == "example.com" # Auto-extracted in post_init + + def test_id_generation(self): + meta = CitationMetadata(url="https://example.com", title="Test") + # Just check it's a non-empty string, length 12 + assert len(meta.id) == 12 + assert isinstance(meta.id, str) + + def test_to_dict(self): + meta = CitationMetadata( + url="https://example.com", title="Test", relevance_score=0.8 + ) + data = meta.to_dict() + assert data["url"] == "https://example.com" + assert data["title"] == "Test" + assert data["relevance_score"] == 0.8 + assert "id" in data + + +class TestCitation: + def test_citation_wrapper(self): + meta = CitationMetadata(url="https://example.com", title="Test") + citation = Citation(number=1, metadata=meta) + + assert citation.number == 1 + assert citation.url == "https://example.com" + assert citation.title == "Test" + assert citation.to_markdown_reference() == "[Test](https://example.com)" + assert citation.to_numbered_reference() == "[1] Test - https://example.com" + + +class TestExtractor: + def test_extract_from_tool_message_web_search(self): + search_result = { + "results": [ + { + "url": "https://example.com/1", + "title": "Result 1", + "content": "Content 1", + "score": 0.9, + } + ] 
+ } + + msg = ToolMessage( + content=str(search_result).replace("'", '"'), # Simple JSON dump simulation + tool_call_id="call_1", + name="web_search", + ) + # Mocking json structure if ToolMessage content expects stringified JSON + import json + + msg.content = json.dumps(search_result) + + citations = extract_citations_from_messages([msg]) + assert len(citations) == 1 + assert citations[0]["url"] == "https://example.com/1" + assert citations[0]["title"] == "Result 1" + + def test_extract_domain(self): + assert _extract_domain("https://www.example.com/path") == "www.example.com" + assert _extract_domain("http://example.org") == "example.org" + + def test_merge_citations(self): + existing = [{"url": "https://a.com", "title": "A", "relevance_score": 0.5}] + new = [ + {"url": "https://b.com", "title": "B", "relevance_score": 0.6}, + { + "url": "https://a.com", + "title": "A New", + "relevance_score": 0.7, + }, # Better score for A + ] + + merged = merge_citations(existing, new) + assert len(merged) == 2 + + # Check A was updated + a_citation = next(c for c in merged if c["url"] == "https://a.com") + assert a_citation["relevance_score"] == 0.7 + + # Check B is present + b_citation = next(c for c in merged if c["url"] == "https://b.com") + assert b_citation["title"] == "B" + + def test_citations_to_markdown(self): + citations = [{"url": "https://a.com", "title": "A", "description": "Desc A"}] + md = citations_to_markdown_references(citations) + assert "## Key Citations" in md + assert "- [A](https://a.com)" in md + + +class TestCollector: + def test_add_citations(self): + collector = CitationCollector() + results = [ + {"url": "https://example.com", "title": "Example", "content": "Test"} + ] + added = collector.add_from_search_results(results, query="test") + + assert len(added) == 1 + assert added[0].url == "https://example.com" + assert collector.count == 1 + + +class TestFormatter: + def test_format_inline(self): + formatter = CitationFormatter(style="superscript") + 
assert formatter.format_inline_marker(1) == "¹" + assert formatter.format_inline_marker(12) == "¹²" diff --git a/tests/unit/config/test_configuration.py b/tests/unit/config/test_configuration.py index 8551d47..823efd1 100644 --- a/tests/unit/config/test_configuration.py +++ b/tests/unit/config/test_configuration.py @@ -135,17 +135,18 @@ def test_from_runnable_config_with_boolean_true_values(): assert config.enable_deep_thinking is True assert config.enforce_web_search is True - -def test_get_recursion_limit_default(): +def test_get_recursion_limit_default(monkeypatch): from src.config.configuration import get_recursion_limit + monkeypatch.delenv("AGENT_RECURSION_LIMIT", raising=False) result = get_recursion_limit() assert result == 25 -def test_get_recursion_limit_custom_default(): +def test_get_recursion_limit_custom_default(monkeypatch): from src.config.configuration import get_recursion_limit + monkeypatch.delenv("AGENT_RECURSION_LIMIT", raising=False) result = get_recursion_limit(50) assert result == 50 diff --git a/web/package.json b/web/package.json index 7d33081..93b3411 100644 --- a/web/package.json +++ b/web/package.json @@ -27,6 +27,7 @@ "@radix-ui/react-collapsible": "^1.1.8", "@radix-ui/react-dialog": "^1.1.10", "@radix-ui/react-dropdown-menu": "^2.1.11", + "@radix-ui/react-hover-card": "^1.1.6", "@radix-ui/react-icons": "^1.3.2", "@radix-ui/react-label": "^2.1.4", "@radix-ui/react-popover": "^1.1.11", diff --git a/web/pnpm-lock.yaml b/web/pnpm-lock.yaml index bc56d80..f351270 100644 --- a/web/pnpm-lock.yaml +++ b/web/pnpm-lock.yaml @@ -29,6 +29,9 @@ importers: '@radix-ui/react-dropdown-menu': specifier: ^2.1.11 version: 2.1.11(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + '@radix-ui/react-hover-card': + specifier: ^1.1.6 + version: 1.1.15(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) '@radix-ui/react-icons': specifier: 
^1.3.2 version: 1.3.2(react@19.1.0) @@ -1030,6 +1033,9 @@ packages: '@radix-ui/primitive@1.1.2': resolution: {integrity: sha512-XnbHrrprsNqZKQhStrSwgRUQzoCI1glLzdw79xiZPoofhGICeZRSQ3dIxAKH1gb3OHfNf4d6f+vAv3kil2eggA==} + '@radix-ui/primitive@1.1.3': + resolution: {integrity: sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==} + '@radix-ui/react-accordion@1.2.8': resolution: {integrity: sha512-c7OKBvO36PfQIUGIjj1Wko0hH937pYFU2tR5zbIJDUsmTzHoZVHHt4bmb7OOJbzTaWJtVELKWojBHa7OcnUHmQ==} peerDependencies: @@ -1069,6 +1075,19 @@ packages: '@types/react-dom': optional: true + '@radix-ui/react-arrow@1.1.7': + resolution: {integrity: sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==} + peerDependencies: + '@types/react': '*' + '@types/react-dom': '*' + react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + react-dom: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + peerDependenciesMeta: + '@types/react': + optional: true + '@types/react-dom': + optional: true + '@radix-ui/react-checkbox@1.2.3': resolution: {integrity: sha512-pHVzDYsnaDmBlAuwim45y3soIN8H4R7KbkSVirGhXO+R/kO2OLCe0eucUEbddaTcdMHHdzcIGHtZSMSQlA+apw==} peerDependencies: @@ -1161,6 +1180,19 @@ packages: '@types/react': optional: true + '@radix-ui/react-dismissable-layer@1.1.11': + resolution: {integrity: sha512-Nqcp+t5cTB8BinFkZgXiMJniQH0PsUt2k51FUhbdfeKvc4ACcG2uQniY/8+h1Yv6Kza4Q7lD7PQV0z0oicE0Mg==} + peerDependencies: + '@types/react': '*' + '@types/react-dom': '*' + react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + react-dom: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + peerDependenciesMeta: + '@types/react': + optional: true + '@types/react-dom': + optional: true + '@radix-ui/react-dismissable-layer@1.1.6': resolution: {integrity: sha512-7gpgMT2gyKym9Jz2ZhlRXSg2y6cNQIK8d/cqBZ0RBCaps8pFryCWXiUKI+uHGFrhMrbGUP7U6PWgiXzIxoyF3Q==} peerDependencies: @@ -1222,6 +1254,19 @@ packages: '@types/react-dom': optional: true + 
'@radix-ui/react-hover-card@1.1.15': + resolution: {integrity: sha512-qgTkjNT1CfKMoP0rcasmlH2r1DAiYicWsDsufxl940sT2wHNEWWv6FMWIQXWhVdmC1d/HYfbhQx60KYyAtKxjg==} + peerDependencies: + '@types/react': '*' + '@types/react-dom': '*' + react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + react-dom: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + peerDependenciesMeta: + '@types/react': + optional: true + '@types/react-dom': + optional: true + '@radix-ui/react-icons@1.3.2': resolution: {integrity: sha512-fyQIhGDhzfc9pK2kH6Pl9c4BDJGfMkPqkyIgYDthyNYoNg3wVhoJMMh19WS4Up/1KMPFVpNsT2q3WmXn2N1m6g==} peerDependencies: @@ -1301,6 +1346,19 @@ packages: '@types/react-dom': optional: true + '@radix-ui/react-popper@1.2.8': + resolution: {integrity: sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==} + peerDependencies: + '@types/react': '*' + '@types/react-dom': '*' + react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + react-dom: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + peerDependenciesMeta: + '@types/react': + optional: true + '@types/react-dom': + optional: true + '@radix-ui/react-portal@1.1.5': resolution: {integrity: sha512-ps/67ZqsFm+Mb6lSPJpfhRLrVL2i2fntgCmGMqqth4eaGUf+knAuuRtWVJrNjUhExgmdRqftSgzpf0DF0n6yXA==} peerDependencies: @@ -1327,6 +1385,19 @@ packages: '@types/react-dom': optional: true + '@radix-ui/react-portal@1.1.9': + resolution: {integrity: sha512-bpIxvq03if6UNwXZ+HTK71JLh4APvnXntDc6XOX8UVq4XQOVl7lwok0AvIl+b8zgCw3fSaVTZMpAPPagXbKmHQ==} + peerDependencies: + '@types/react': '*' + '@types/react-dom': '*' + react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + react-dom: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + peerDependenciesMeta: + '@types/react': + optional: true + '@types/react-dom': + optional: true + '@radix-ui/react-presence@1.1.3': resolution: {integrity: sha512-IrVLIhskYhH3nLvtcBLQFZr61tBG7wx7O3kEmdzcYwRGAEBmBicGGL7ATzNgruYJ3xBTbuzEEq9OXJM3PAX3tA==} peerDependencies: @@ -1353,6 +1424,19 @@ 
packages: '@types/react-dom': optional: true + '@radix-ui/react-presence@1.1.5': + resolution: {integrity: sha512-/jfEwNDdQVBCNvjkGit4h6pMOzq8bHkopq458dPt2lMjx+eBQUohZNG9A7DtO/O5ukSbxuaNGXMjHicgwy6rQQ==} + peerDependencies: + '@types/react': '*' + '@types/react-dom': '*' + react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + react-dom: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + peerDependenciesMeta: + '@types/react': + optional: true + '@types/react-dom': + optional: true + '@radix-ui/react-primitive@2.0.3': resolution: {integrity: sha512-Pf/t/GkndH7CQ8wE2hbkXA+WyZ83fhQQn5DDmwDiDo6AwN/fhaH8oqZ0jRjMrO2iaMhDi6P1HRx6AZwyMinY1g==} peerDependencies: @@ -1379,6 +1463,19 @@ packages: '@types/react-dom': optional: true + '@radix-ui/react-primitive@2.1.3': + resolution: {integrity: sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==} + peerDependencies: + '@types/react': '*' + '@types/react-dom': '*' + react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + react-dom: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + peerDependenciesMeta: + '@types/react': + optional: true + '@types/react-dom': + optional: true + '@radix-ui/react-roving-focus@1.1.3': resolution: {integrity: sha512-ufbpLUjZiOg4iYgb2hQrWXEPYX6jOLBbR27bDyAff5GYMRrCzcze8lukjuXVUQvJ6HZe8+oL+hhswDcjmcgVyg==} peerDependencies: @@ -1466,6 +1563,15 @@ packages: '@types/react': optional: true + '@radix-ui/react-slot@1.2.3': + resolution: {integrity: sha512-aeNmHnBxbi2St0au6VBVC7JXFlhLlOnvIIlePNniyUNAClzmtAUEY8/pBiK3iHjufOlwA+c20/8jngo7xcrg8A==} + peerDependencies: + '@types/react': '*' + react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + peerDependenciesMeta: + '@types/react': + optional: true + '@radix-ui/react-switch@1.2.2': resolution: {integrity: sha512-7Z8n6L+ifMIIYZ83f28qWSceUpkXuslI2FJ34+kDMTiyj91ENdpdQ7VCidrzj5JfwfZTeano/BnGBbu/jqa5rQ==} peerDependencies: @@ -6616,6 +6722,8 @@ snapshots: '@radix-ui/primitive@1.1.2': {} + '@radix-ui/primitive@1.1.3': {} 
+ '@radix-ui/react-accordion@1.2.8(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': dependencies: '@radix-ui/primitive': 1.1.2 @@ -6651,6 +6759,15 @@ snapshots: '@types/react': 19.1.2 '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-arrow@1.1.7(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': + dependencies: + '@radix-ui/react-primitive': 2.1.3(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + react: 19.1.0 + react-dom: 19.1.0(react@19.1.0) + optionalDependencies: + '@types/react': 19.1.2 + '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-checkbox@1.2.3(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': dependencies: '@radix-ui/primitive': 1.1.2 @@ -6747,6 +6864,19 @@ snapshots: optionalDependencies: '@types/react': 19.1.2 + '@radix-ui/react-dismissable-layer@1.1.11(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': + dependencies: + '@radix-ui/primitive': 1.1.3 + '@radix-ui/react-compose-refs': 1.1.2(@types/react@19.1.2)(react@19.1.0) + '@radix-ui/react-primitive': 2.1.3(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + '@radix-ui/react-use-callback-ref': 1.1.1(@types/react@19.1.2)(react@19.1.0) + '@radix-ui/react-use-escape-keydown': 1.1.1(@types/react@19.1.2)(react@19.1.0) + react: 19.1.0 + react-dom: 19.1.0(react@19.1.0) + optionalDependencies: + '@types/react': 19.1.2 + '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-dismissable-layer@1.1.6(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': dependencies: '@radix-ui/primitive': 1.1.2 @@ -6805,6 +6935,23 @@ 
snapshots: '@types/react': 19.1.2 '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-hover-card@1.1.15(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': + dependencies: + '@radix-ui/primitive': 1.1.3 + '@radix-ui/react-compose-refs': 1.1.2(@types/react@19.1.2)(react@19.1.0) + '@radix-ui/react-context': 1.1.2(@types/react@19.1.2)(react@19.1.0) + '@radix-ui/react-dismissable-layer': 1.1.11(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + '@radix-ui/react-popper': 1.2.8(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + '@radix-ui/react-portal': 1.1.9(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + '@radix-ui/react-presence': 1.1.5(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + '@radix-ui/react-primitive': 2.1.3(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + '@radix-ui/react-use-controllable-state': 1.2.2(@types/react@19.1.2)(react@19.1.0) + react: 19.1.0 + react-dom: 19.1.0(react@19.1.0) + optionalDependencies: + '@types/react': 19.1.2 + '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-icons@1.3.2(react@19.1.0)': dependencies: react: 19.1.0 @@ -6910,6 +7057,24 @@ snapshots: '@types/react': 19.1.2 '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-popper@1.2.8(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': + dependencies: + '@floating-ui/react-dom': 2.1.2(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + '@radix-ui/react-arrow': 1.1.7(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + 
'@radix-ui/react-compose-refs': 1.1.2(@types/react@19.1.2)(react@19.1.0) + '@radix-ui/react-context': 1.1.2(@types/react@19.1.2)(react@19.1.0) + '@radix-ui/react-primitive': 2.1.3(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + '@radix-ui/react-use-callback-ref': 1.1.1(@types/react@19.1.2)(react@19.1.0) + '@radix-ui/react-use-layout-effect': 1.1.1(@types/react@19.1.2)(react@19.1.0) + '@radix-ui/react-use-rect': 1.1.1(@types/react@19.1.2)(react@19.1.0) + '@radix-ui/react-use-size': 1.1.1(@types/react@19.1.2)(react@19.1.0) + '@radix-ui/rect': 1.1.1 + react: 19.1.0 + react-dom: 19.1.0(react@19.1.0) + optionalDependencies: + '@types/react': 19.1.2 + '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-portal@1.1.5(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': dependencies: '@radix-ui/react-primitive': 2.0.3(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) @@ -6930,6 +7095,16 @@ snapshots: '@types/react': 19.1.2 '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-portal@1.1.9(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': + dependencies: + '@radix-ui/react-primitive': 2.1.3(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + '@radix-ui/react-use-layout-effect': 1.1.1(@types/react@19.1.2)(react@19.1.0) + react: 19.1.0 + react-dom: 19.1.0(react@19.1.0) + optionalDependencies: + '@types/react': 19.1.2 + '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-presence@1.1.3(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': dependencies: '@radix-ui/react-compose-refs': 1.1.2(@types/react@19.1.2)(react@19.1.0) @@ -6950,6 +7125,16 @@ snapshots: 
'@types/react': 19.1.2 '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-presence@1.1.5(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': + dependencies: + '@radix-ui/react-compose-refs': 1.1.2(@types/react@19.1.2)(react@19.1.0) + '@radix-ui/react-use-layout-effect': 1.1.1(@types/react@19.1.2)(react@19.1.0) + react: 19.1.0 + react-dom: 19.1.0(react@19.1.0) + optionalDependencies: + '@types/react': 19.1.2 + '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-primitive@2.0.3(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': dependencies: '@radix-ui/react-slot': 1.2.0(@types/react@19.1.2)(react@19.1.0) @@ -6968,6 +7153,15 @@ snapshots: '@types/react': 19.1.2 '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-primitive@2.1.3(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': + dependencies: + '@radix-ui/react-slot': 1.2.3(@types/react@19.1.2)(react@19.1.0) + react: 19.1.0 + react-dom: 19.1.0(react@19.1.0) + optionalDependencies: + '@types/react': 19.1.2 + '@types/react-dom': 19.1.1(@types/react@19.1.2) + '@radix-ui/react-roving-focus@1.1.3(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': dependencies: '@radix-ui/primitive': 1.1.2 @@ -7083,6 +7277,13 @@ snapshots: optionalDependencies: '@types/react': 19.1.2 + '@radix-ui/react-slot@1.2.3(@types/react@19.1.2)(react@19.1.0)': + dependencies: + '@radix-ui/react-compose-refs': 1.1.2(@types/react@19.1.2)(react@19.1.0) + react: 19.1.0 + optionalDependencies: + '@types/react': 19.1.2 + '@radix-ui/react-switch@1.2.2(@types/react-dom@19.1.1(@types/react@19.1.2))(@types/react@19.1.2)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': dependencies: '@radix-ui/primitive': 1.1.2 diff --git 
a/web/src/app/chat/components/research-report-block.tsx b/web/src/app/chat/components/research-report-block.tsx index 00807a8..792abd9 100644 --- a/web/src/app/chat/components/research-report-block.tsx +++ b/web/src/app/chat/components/research-report-block.tsx @@ -7,11 +7,12 @@ import { LoadingAnimation } from "~/components/deer-flow/loading-animation"; import { Markdown } from "~/components/deer-flow/markdown"; import ReportEditor from "~/components/editor"; import { useReplay } from "~/core/replay"; -import { useMessage, useStore } from "~/core/store"; +import { useCitations, useMessage, useStore } from "~/core/store"; import { cn } from "~/lib/utils"; export function ResearchReportBlock({ className, + researchId, messageId, editing, }: { @@ -21,6 +22,7 @@ export function ResearchReportBlock({ editing: boolean; }) { const message = useMessage(messageId); + const citations = useCitations(researchId); const { isReplay } = useReplay(); const handleMarkdownChange = useCallback( (markdown: string) => { @@ -61,7 +63,7 @@ export function ResearchReportBlock({ /> ) : ( <> - + {message?.content} {message?.isStreaming && } diff --git a/web/src/components/deer-flow/citation.tsx b/web/src/components/deer-flow/citation.tsx new file mode 100644 index 0000000..ccc1843 --- /dev/null +++ b/web/src/components/deer-flow/citation.tsx @@ -0,0 +1,308 @@ +// Copyright (c) 2025 Bytedance Ltd. 
and/or its affiliates +// SPDX-License-Identifier: MIT + +import { ExternalLink, Globe, Clock, Star } from "lucide-react"; +import { useMemo } from "react"; + +import { + HoverCard, + HoverCardContent, + HoverCardTrigger, +} from "~/components/ui/hover-card"; +import { cn } from "~/lib/utils"; +import type { Citation } from "~/core/messages"; + +// Re-export Citation type as CitationData for backward compatibility +export type CitationData = Citation; + +interface CitationLinkProps { + href: string; + children: React.ReactNode; + citations: CitationData[]; + className?: string; + id?: string; +} + +/** + * Enhanced link component that shows citation metadata on hover. + * Used within markdown content to provide rich citation information. + */ +export function CitationLink({ + href, + children, + citations, + className, + id, +}: CitationLinkProps) { + // Find matching citation data for this URL + const { citation, index } = useMemo(() => { + if (!href || !citations) return { citation: null, index: -1 }; + + // Try exact match first + let matchIndex = citations.findIndex((c) => c.url === href); + + // If not found, try versatile comparison using normalized URLs + if (matchIndex === -1) { + const normalizeUrl = (url: string) => { + try { + return decodeURIComponent(url).trim(); + } catch { + return url.trim(); + } + }; + + const normalizedHref = normalizeUrl(href); + + matchIndex = citations.findIndex( + (c) => normalizeUrl(c.url) === normalizedHref + ); + } + + const match = matchIndex !== -1 ? citations[matchIndex] : null; + + return { citation: match, index: matchIndex }; + }, [href, citations]); + + // If no citation data found, render as regular link + if (!citation) { + return ( + + {children} + + ); + } + + const handleCitationClick = (e: React.MouseEvent) => { + // If it's an internal-looking citation (e.g. 
[1]) + // or if the user clicks the citation number in the text + // we try to scroll to the reference list at the bottom + if (index !== -1) { + const targetId = `ref-${index + 1}`; + const element = document.getElementById(targetId); + if (element) { + e.preventDefault(); + element.scrollIntoView({ behavior: "smooth", block: "start" }); + } + } + // If element not found or index is -1, let the default behavior (open URL) happen + }; + + return ( + + + + {children} + + + + + + + + + + ); +} + +interface CitationCardProps { + citation: CitationData; + compact?: boolean; +} + +/** + * Card component displaying citation metadata. + */ +export function CitationCard({ citation, compact = false }: CitationCardProps) { + const { + title, + url, + description, + domain, + relevance_score, + accessed_at, + source_type, + } = citation; + + // Format access date + const formattedDate = useMemo(() => { + if (!accessed_at) return null; + try { + const date = new Date(accessed_at); + return date.toLocaleDateString(undefined, { + year: "numeric", + month: "short", + day: "numeric", + }); + } catch { + return accessed_at.slice(0, 10); + } + }, [accessed_at]); + + // Format relevance score as percentage + const relevancePercent = useMemo(() => { + if (relevance_score == null || relevance_score <= 0) return null; + return Math.round(relevance_score * 100); + }, [relevance_score]); + + return ( + + {/* Title */} + + {title} + + + {/* Domain and metadata row */} + + {domain && ( + + + {domain} + + )} + {formattedDate && ( + + + {formattedDate} + + )} + {relevancePercent != null && ( + + + {relevancePercent}% match + + )} + + + {/* Description/snippet */} + {description && !compact && ( + + {description} + + )} + + {/* Source type badge */} + {source_type && ( + + {source_type === "web_search" ? 
"Web" : source_type} + + )} + + {/* URL preview */} + + {url} + + + ); +} + +interface CitationListProps { + citations: CitationData[]; + title?: string; + className?: string; +} + +/** + * List component for displaying all citations. + */ +export function CitationList({ + citations, + title = "Sources", + className, +}: CitationListProps) { + if (!citations || citations.length === 0) { + return null; + } + + return ( +
+

{title}

+
+ {citations.map((citation, index) => ( +
+
+ + {index + 1} + +
+ + {citation.title} + + {citation.domain && ( +

+ {citation.domain} +

+ )} + {citation.description && ( +

+ {citation.description} +

+ )} +
+
+
+ ))} +
+
+ ); +} + +interface CitationBadgeProps { + number: number; + citation?: CitationData; + onClick?: () => void; +} + +/** + * Small numbered badge for inline citations. + */ +export function CitationBadge({ number, citation, onClick }: CitationBadgeProps) { + const badge = ( + + ); + + if (!citation) { + return badge; + } + + return ( + + {badge} + + + + + ); +} diff --git a/web/src/components/deer-flow/markdown.tsx b/web/src/components/deer-flow/markdown.tsx index 6786b18..6eb612a 100644 --- a/web/src/components/deer-flow/markdown.tsx +++ b/web/src/components/deer-flow/markdown.tsx @@ -20,6 +20,7 @@ import { cn } from "~/lib/utils"; import Image from "./image"; import { Tooltip } from "./tooltip"; import { Link } from "./link"; +import { CitationLink, type CitationData } from "./citation"; export function Markdown({ className, @@ -28,6 +29,7 @@ export function Markdown({ enableCopy, animated = false, checkLinkCredibility = false, + citations = [], ...props }: ReactMarkdownOptions & { className?: string; @@ -35,21 +37,127 @@ export function Markdown({ style?: React.CSSProperties; animated?: boolean; checkLinkCredibility?: boolean; + citations?: CitationData[]; }) { + // Pre-compute normalized URL map for O(1) lookup + const citationMap = useMemo(() => { + const map = new Map(); + citations?.forEach((c, index) => { + if (!c.url) return; + + // Add exact match + map.set(c.url, index); + + // Add decoded match + try { + const decoded = decodeURIComponent(c.url); + if (decoded !== c.url) map.set(decoded, index); + } catch {} + + // Add encoded match + try { + const encoded = encodeURI(c.url); + if (encoded !== c.url) map.set(encoded, index); + } catch {} + }); + return map; + }, [citations]); + const components: ReactMarkdownOptions["components"] = useMemo(() => { return { - a: ({ href, children }) => ( - - {children} - - ), + a: ({ href, children }) => { + const hrefStr = href ?? 
""; + + // Handle citation anchor targets (rendered in Reference list) + // Format: [[n]](#citation-target-n) + const targetMatch = hrefStr.match(/^#citation-target-(\d+)$/); + if (targetMatch) { + const index = targetMatch[1]; + return ( + + [{index}] + + ); + } + + // Handle inline citation links (rendered in text) + // Format: [[n]](#ref-n), [n](#ref1), [n](#1) + const linkMatch = hrefStr.match(/^#(?:ref-?)?(\d+)$/); + if (linkMatch) { + return ( + { + e.preventDefault(); + const targetId = `ref-${linkMatch[1]}`; + const element = document.getElementById(targetId); + if (element) { + element.scrollIntoView({ behavior: "smooth", block: "start" }); + } + }} + > + {children} + + ); + } + + // If we have citation data, use CitationLink for enhanced display + if (citations && citations.length > 0) { + // Find if this URL is one of our citations + const citationIndex = citationMap.get(hrefStr) ?? -1; + + if (citationIndex !== -1) { + // Heuristic to determine if this is a citation target (in Reference list) + // vs a citation link (in text). + // Targets are usually the full title, while links are numbers like [1]. + const childrenText = Array.isArray(children) + ? children.join("") + : String(children); + // Heuristic: inline citation text usually looks like a numeric marker + // rather than a full title. We treat the following as "inline": + // "1", "[1]", "^1^", "[^1]" (with optional surrounding whitespace). + // This pattern matches either: + // - a bracketed number: "[1]" + // - a caret-style number: "1", "^1", "1^", "^1^" + // and ignores surrounding whitespace. 
+ const inlineCitationPattern = /^\s*(?:\[\d+\]|\^?\d+\^?)\s*$/; + const isInline = inlineCitationPattern.test(childrenText); + + return ( + + {children} + + ); + } + + return ( + + {children} + + ); + } + // Otherwise fall back to regular Link + return ( + + {children} + + ); + }, img: ({ src, alt }) => ( {alt ), }; - }, [checkLinkCredibility]); + }, [checkLinkCredibility, citations, citationMap]); const rehypePlugins = useMemo>(() => { const plugins: NonNullable = [[ diff --git a/web/src/components/ui/hover-card.tsx b/web/src/components/ui/hover-card.tsx new file mode 100644 index 0000000..c9bfdb7 --- /dev/null +++ b/web/src/components/ui/hover-card.tsx @@ -0,0 +1,34 @@ +// Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +// SPDX-License-Identifier: MIT + +"use client"; + +import * as React from "react"; +import * as HoverCardPrimitive from "@radix-ui/react-hover-card"; + +import { cn } from "~/lib/utils"; + +const HoverCard = HoverCardPrimitive.Root; + +const HoverCardTrigger = HoverCardPrimitive.Trigger; + +const HoverCardContent = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, align = "center", sideOffset = 4, ...props }, ref) => ( + + + +)); +HoverCardContent.displayName = HoverCardPrimitive.Content.displayName; + +export { HoverCard, HoverCardTrigger, HoverCardContent }; diff --git a/web/src/core/api/types.ts b/web/src/core/api/types.ts index bb69572..4e96a61 100644 --- a/web/src/core/api/types.ts +++ b/web/src/core/api/types.ts @@ -1,7 +1,7 @@ // Copyright (c) 2025 Bytedance Ltd. 
and/or its affiliates // SPDX-License-Identifier: MIT -import type { Option } from "../messages"; +import type { Citation, Option } from "../messages"; // Tool Calls @@ -76,9 +76,18 @@ export interface InterruptEvent } > {} +export interface CitationsEvent { + type: "citations"; + data: { + thread_id: string; + citations: Citation[]; + }; +} + export type ChatEvent = | MessageChunkEvent | ToolCallsEvent | ToolCallChunksEvent | ToolCallResultEvent - | InterruptEvent; + | InterruptEvent + | CitationsEvent; diff --git a/web/src/core/messages/merge-message.ts b/web/src/core/messages/merge-message.ts index a47fc73..4e8adc3 100644 --- a/web/src/core/messages/merge-message.ts +++ b/web/src/core/messages/merge-message.ts @@ -53,7 +53,7 @@ export function mergeMessage(message: Message, event: ChatEvent) { } else if (event.type === "interrupt") { mergeInterruptMessage(message, event); } - if (event.data.finish_reason) { + if (event.type !== "citations" && event.data.finish_reason) { message.finishReason = event.data.finish_reason; message.isStreaming = false; if (message.toolCalls) { diff --git a/web/src/core/messages/types.ts b/web/src/core/messages/types.ts index 05ca4bb..364e286 100644 --- a/web/src/core/messages/types.ts +++ b/web/src/core/messages/types.ts @@ -25,6 +25,7 @@ export interface Message { finishReason?: "stop" | "interrupt" | "tool_calls"; interruptFeedback?: string; resources?: Array; + citations?: Array; } export interface Option { @@ -45,3 +46,14 @@ export interface Resource { title: string; description?: string; } + +export interface Citation { + url: string; + title: string; + description?: string; + content_snippet?: string; + domain?: string; + relevance_score?: number; + accessed_at?: string; + source_type?: string; +} diff --git a/web/src/core/store/store.ts b/web/src/core/store/store.ts index d00935f..d725196 100644 --- a/web/src/core/store/store.ts +++ b/web/src/core/store/store.ts @@ -7,7 +7,7 @@ import { create } from "zustand"; import { 
useShallow } from "zustand/react/shallow"; import { chatStream, generatePodcast } from "../api"; -import type { Message, Resource } from "../messages"; +import type { Citation, Message, Resource } from "../messages"; import { mergeMessage } from "../messages"; import { parseJSON } from "../utils"; @@ -25,6 +25,7 @@ export const useStore = create<{ researchReportIds: Map; researchActivityIds: Map; researchQueries: Map; + researchCitations: Map; ongoingResearchId: string | null; openResearchId: string | null; @@ -34,6 +35,7 @@ export const useStore = create<{ openResearch: (researchId: string | null) => void; closeResearch: () => void; setOngoingResearch: (researchId: string | null) => void; + setCitations: (researchId: string, citations: Citation[]) => void; }>((set) => ({ responding: false, threadId: THREAD_ID, @@ -44,6 +46,7 @@ export const useStore = create<{ researchReportIds: new Map(), researchActivityIds: new Map(), researchQueries: new Map(), + researchCitations: new Map(), ongoingResearchId: null, openResearchId: null, @@ -80,6 +83,11 @@ export const useStore = create<{ setOngoingResearch(researchId: string | null) { set({ ongoingResearchId: researchId }); }, + setCitations(researchId: string, citations: Citation[]) { + set((state) => ({ + researchCitations: new Map(state.researchCitations).set(researchId, citations), + })); + }, })); export async function sendMessage( @@ -148,6 +156,15 @@ export async function sendMessage( const { type, data } = event; let message: Message | undefined; + // Handle citations event: store citations for the current research + if (type === "citations") { + const ongoingResearchId = useStore.getState().ongoingResearchId; + if (ongoingResearchId && data.citations) { + useStore.getState().setCitations(ongoingResearchId, data.citations); + } + continue; + } + // Handle tool_call_result specially: use the message that contains the tool call if (type === "tool_call_result") { message = findMessageByToolCallId(data.tool_call_id); @@ 
-496,3 +513,15 @@ export function useToolCalls() { }), ); } + +export function useCitations(researchId: string | null | undefined) { + return useStore( + useShallow((state) => + researchId ? state.researchCitations.get(researchId) ?? [] : [] + ), + ); +} + +export function getCitations(researchId: string): Citation[] { + return useStore.getState().researchCitations.get(researchId) ?? []; +}