mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-29 00:34:47 +08:00
* improve: Improved citation system * fix --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
286 lines
8.7 KiB
Python
286 lines
8.7 KiB
Python
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
"""
|
|
Citation collector for gathering and managing citations during research.
|
|
"""
|
|
|
|
import logging
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from .models import Citation, CitationMetadata
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class CitationCollector:
|
|
"""
|
|
Collects and manages citations during the research process.
|
|
|
|
This class handles:
|
|
- Collecting citations from search results and crawled pages
|
|
- Deduplicating citations by URL
|
|
- Assigning citation numbers
|
|
- Tracking which citations are actually used in the report
|
|
"""
|
|
|
|
def __init__(self):
|
|
self._citations: Dict[str, CitationMetadata] = {} # url -> metadata
|
|
self._citation_order: List[str] = [] # ordered list of URLs
|
|
self._used_citations: set[str] = set() # URLs that are actually cited
|
|
self._url_to_index: Dict[str, int] = {} # url -> index of _citation_order (O(1) lookup)
|
|
|
|
def add_from_search_results(
|
|
self, results: List[Dict[str, Any]], query: str = ""
|
|
) -> List[CitationMetadata]:
|
|
"""
|
|
Add citations from search results.
|
|
|
|
Args:
|
|
results: List of search result dictionaries
|
|
query: The search query that produced these results
|
|
|
|
Returns:
|
|
List of CitationMetadata objects that were added
|
|
"""
|
|
added = []
|
|
for result in results:
|
|
# Skip image results
|
|
if result.get("type") == "image_url":
|
|
continue
|
|
|
|
url = result.get("url")
|
|
if not url:
|
|
continue
|
|
|
|
# Create or update citation metadata
|
|
metadata = CitationMetadata.from_search_result(result, query)
|
|
|
|
if url not in self._citations:
|
|
self._citations[url] = metadata
|
|
self._citation_order.append(url)
|
|
self._url_to_index[url] = len(self._citation_order) - 1
|
|
added.append(metadata)
|
|
logger.debug(f"Added citation: {metadata.title} ({url})")
|
|
else:
|
|
# Update with potentially better metadata
|
|
existing = self._citations[url]
|
|
if metadata.relevance_score > existing.relevance_score:
|
|
self._citations[url] = metadata
|
|
logger.debug(f"Updated citation: {metadata.title} ({url})")
|
|
|
|
return added
|
|
|
|
def add_from_crawl_result(
|
|
self, url: str, title: str, content: Optional[str] = None, **extra_metadata
|
|
) -> CitationMetadata:
|
|
"""
|
|
Add or update a citation from a crawled page.
|
|
|
|
Args:
|
|
url: The URL of the crawled page
|
|
title: The page title
|
|
content: The page content
|
|
**extra_metadata: Additional metadata fields
|
|
|
|
Returns:
|
|
The CitationMetadata object
|
|
"""
|
|
if url in self._citations:
|
|
# Update existing citation with crawled content
|
|
metadata = self._citations[url]
|
|
if title and title != "Untitled":
|
|
metadata.title = title
|
|
if content:
|
|
metadata.raw_content = content
|
|
if not metadata.content_snippet:
|
|
metadata.content_snippet = content[:500]
|
|
else:
|
|
# Create new citation
|
|
metadata = CitationMetadata(
|
|
url=url,
|
|
title=title or "Untitled",
|
|
content_snippet=content[:500] if content else None,
|
|
raw_content=content,
|
|
**extra_metadata,
|
|
)
|
|
self._citations[url] = metadata
|
|
self._citation_order.append(url)
|
|
self._url_to_index[url] = len(self._citation_order) - 1
|
|
|
|
return metadata
|
|
|
|
def mark_used(self, url: str) -> Optional[int]:
|
|
"""
|
|
Mark a citation as used and return its number.
|
|
|
|
Args:
|
|
url: The URL of the citation
|
|
|
|
Returns:
|
|
The citation number (1-indexed) or None if not found
|
|
"""
|
|
if url in self._citations:
|
|
self._used_citations.add(url)
|
|
return self.get_number(url)
|
|
return None
|
|
|
|
def get_number(self, url: str) -> Optional[int]:
|
|
"""
|
|
Get the citation number for a URL (O(1) time complexity).
|
|
|
|
Args:
|
|
url: The URL to look up
|
|
|
|
Returns:
|
|
The citation number (1-indexed) or None if not found
|
|
"""
|
|
index = self._url_to_index.get(url)
|
|
return index + 1 if index is not None else None
|
|
|
|
def get_metadata(self, url: str) -> Optional[CitationMetadata]:
|
|
"""
|
|
Get the metadata for a URL.
|
|
|
|
Args:
|
|
url: The URL to look up
|
|
|
|
Returns:
|
|
The CitationMetadata or None if not found
|
|
"""
|
|
return self._citations.get(url)
|
|
|
|
def get_all_citations(self) -> List[Citation]:
|
|
"""
|
|
Get all collected citations in order.
|
|
|
|
Returns:
|
|
List of Citation objects
|
|
"""
|
|
citations = []
|
|
for i, url in enumerate(self._citation_order):
|
|
metadata = self._citations[url]
|
|
citations.append(
|
|
Citation(
|
|
number=i + 1,
|
|
metadata=metadata,
|
|
)
|
|
)
|
|
return citations
|
|
|
|
def get_used_citations(self) -> List[Citation]:
|
|
"""
|
|
Get only the citations that have been marked as used.
|
|
|
|
Returns:
|
|
List of Citation objects that are actually used
|
|
"""
|
|
citations = []
|
|
number = 1
|
|
for url in self._citation_order:
|
|
if url in self._used_citations:
|
|
metadata = self._citations[url]
|
|
citations.append(
|
|
Citation(
|
|
number=number,
|
|
metadata=metadata,
|
|
)
|
|
)
|
|
number += 1
|
|
return citations
|
|
|
|
def to_dict(self) -> Dict[str, Any]:
|
|
"""
|
|
Serialize the collector state to a dictionary.
|
|
|
|
Returns:
|
|
Dictionary representation of the collector
|
|
"""
|
|
return {
|
|
"citations": [c.to_dict() for c in self.get_all_citations()],
|
|
"used_urls": list(self._used_citations),
|
|
}
|
|
|
|
@classmethod
|
|
def from_dict(cls, data: Dict[str, Any]) -> "CitationCollector":
|
|
"""
|
|
Deserialize a collector from a dictionary.
|
|
|
|
Args:
|
|
data: Dictionary representation
|
|
|
|
Returns:
|
|
CitationCollector instance
|
|
"""
|
|
collector = cls()
|
|
for citation_data in data.get("citations", []):
|
|
citation = Citation.from_dict(citation_data)
|
|
collector._citations[citation.url] = citation.metadata
|
|
index = len(collector._citation_order)
|
|
collector._citation_order.append(citation.url)
|
|
collector._url_to_index[citation.url] = index
|
|
collector._used_citations = set(data.get("used_urls", []))
|
|
return collector
|
|
|
|
def merge_with(self, other: "CitationCollector") -> None:
|
|
"""
|
|
Merge another collector's citations into this one.
|
|
|
|
Args:
|
|
other: Another CitationCollector to merge
|
|
"""
|
|
for url in other._citation_order:
|
|
if url not in self._citations:
|
|
self._citations[url] = other._citations[url]
|
|
self._citation_order.append(url)
|
|
self._url_to_index[url] = len(self._citation_order) - 1
|
|
self._used_citations.update(other._used_citations)
|
|
|
|
@property
|
|
def count(self) -> int:
|
|
"""Return the total number of citations."""
|
|
return len(self._citations)
|
|
|
|
@property
|
|
def used_count(self) -> int:
|
|
"""Return the number of used citations."""
|
|
return len(self._used_citations)
|
|
|
|
def clear(self) -> None:
|
|
"""Clear all citations."""
|
|
self._citations.clear()
|
|
self._citation_order.clear()
|
|
self._used_citations.clear()
|
|
self._url_to_index.clear()
|
|
|
|
|
|
def extract_urls_from_text(text: str) -> List[str]:
|
|
"""
|
|
Extract URLs from markdown text.
|
|
|
|
Args:
|
|
text: Markdown text that may contain URLs
|
|
|
|
Returns:
|
|
List of URLs found in the text
|
|
"""
|
|
import re
|
|
|
|
urls = []
|
|
|
|
# Match markdown links: [text](url)
|
|
markdown_pattern = r"\[([^\]]+)\]\(([^)]+)\)"
|
|
for match in re.finditer(markdown_pattern, text):
|
|
url = match.group(2)
|
|
if url.startswith(("http://", "https://")):
|
|
urls.append(url)
|
|
|
|
# Match bare URLs
|
|
bare_url_pattern = r"(?<![\(\[])(https?://[^\s\)>\]]+)"
|
|
for match in re.finditer(bare_url_pattern, text):
|
|
url = match.group(1)
|
|
if url not in urls:
|
|
urls.append(url)
|
|
|
|
return urls
|