mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-17 19:44:45 +08:00
chore: Improved citation system (#834)
* improve: Improved citation system * fix --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
@@ -28,6 +28,7 @@ class CitationCollector:
|
||||
self._citations: Dict[str, CitationMetadata] = {} # url -> metadata
|
||||
self._citation_order: List[str] = [] # ordered list of URLs
|
||||
self._used_citations: set[str] = set() # URLs that are actually cited
|
||||
self._url_to_index: Dict[str, int] = {} # url -> index of _citation_order (O(1) lookup)
|
||||
|
||||
def add_from_search_results(
|
||||
self, results: List[Dict[str, Any]], query: str = ""
|
||||
@@ -58,6 +59,7 @@ class CitationCollector:
|
||||
if url not in self._citations:
|
||||
self._citations[url] = metadata
|
||||
self._citation_order.append(url)
|
||||
self._url_to_index[url] = len(self._citation_order) - 1
|
||||
added.append(metadata)
|
||||
logger.debug(f"Added citation: {metadata.title} ({url})")
|
||||
else:
|
||||
@@ -104,6 +106,7 @@ class CitationCollector:
|
||||
)
|
||||
self._citations[url] = metadata
|
||||
self._citation_order.append(url)
|
||||
self._url_to_index[url] = len(self._citation_order) - 1
|
||||
|
||||
return metadata
|
||||
|
||||
@@ -124,7 +127,7 @@ class CitationCollector:
|
||||
|
||||
def get_number(self, url: str) -> Optional[int]:
    """
    Get the citation number for a URL (O(1) time complexity).

    Args:
        url: The URL to look up

    Returns:
        The citation number (1-indexed) or None if not found
    """
    # _url_to_index maps url -> position in _citation_order, maintained by
    # the add/merge paths, so a single dict lookup replaces a linear scan.
    index = self._url_to_index.get(url)
    return index + 1 if index is not None else None
|
||||
|
||||
def get_metadata(self, url: str) -> Optional[CitationMetadata]:
|
||||
"""
|
||||
@@ -215,7 +216,9 @@ class CitationCollector:
|
||||
for citation_data in data.get("citations", []):
|
||||
citation = Citation.from_dict(citation_data)
|
||||
collector._citations[citation.url] = citation.metadata
|
||||
index = len(collector._citation_order)
|
||||
collector._citation_order.append(citation.url)
|
||||
collector._url_to_index[citation.url] = index
|
||||
collector._used_citations = set(data.get("used_urls", []))
|
||||
return collector
|
||||
|
||||
@@ -230,6 +233,7 @@ class CitationCollector:
|
||||
if url not in self._citations:
|
||||
self._citations[url] = other._citations[url]
|
||||
self._citation_order.append(url)
|
||||
self._url_to_index[url] = len(self._citation_order) - 1
|
||||
self._used_citations.update(other._used_citations)
|
||||
|
||||
@property
|
||||
@@ -247,6 +251,7 @@ class CitationCollector:
|
||||
self._citations.clear()
|
||||
self._citation_order.clear()
|
||||
self._used_citations.clear()
|
||||
self._url_to_index.clear()
|
||||
|
||||
|
||||
def extract_urls_from_text(text: str) -> List[str]:
|
||||
|
||||
@@ -7,6 +7,7 @@ Citation extraction utilities for extracting citations from tool results.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain_core.messages import AIMessage, ToolMessage
|
||||
@@ -205,6 +206,84 @@ def _result_to_citation(result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
||||
}
|
||||
|
||||
|
||||
def extract_title_from_content(content: Optional[str], max_length: int = 200) -> str:
    """
    Intelligent title extraction supporting multiple formats.

    Priority:
    1. HTML <title> tag
    2. Markdown h1 (# Title)
    3. Markdown h2-h6 (## Title, etc.)
    4. JSON/YAML title field
    5. First substantial non-empty line
    6. "Untitled" as fallback

    Args:
        content: The content to extract title from (can be None)
        max_length: Maximum title length (default: 200)

    Returns:
        Extracted title or "Untitled"
    """
    if not content:
        return "Untitled"

    # Ordered candidates: (regex, flags, minimum accepted length).
    # The JSON/YAML field requires > 3 chars to avoid trivial matches.
    candidates = [
        (r'<title[^>]*>([^<]+)</title>', re.IGNORECASE | re.DOTALL, 1),
        (r'^#{1}\s+(.+?)$', re.MULTILINE, 1),
        (r'^#{2,6}\s+(.+?)$', re.MULTILINE, 1),
        (r'"?title"?\s*:\s*["\']?([^"\'\n]+)["\']?', re.IGNORECASE, 4),
    ]
    for regex, flags, min_len in candidates:
        found = re.search(regex, content, flags)
        if found:
            candidate = found.group(1).strip()
            if len(candidate) >= min_len:
                return candidate[:max_length]

    # Fallback: first substantial non-empty line, skipping code fences,
    # horizontal rules, list items, and headings.
    skip_prefixes = ('```', '---', '***', '- ', '* ', '+ ', '#')
    for raw_line in content.split('\n'):
        stripped = raw_line.strip()
        if len(stripped) > 10 and not stripped.startswith(skip_prefixes):
            return stripped[:max_length]

    return "Untitled"
|
||||
|
||||
|
||||
def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Extract citation from crawl tool result.
|
||||
@@ -224,18 +303,8 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
|
||||
|
||||
content = data.get("crawled_content", "")
|
||||
|
||||
# Try to extract title from content (first h1 or first line)
|
||||
title = "Untitled"
|
||||
if content:
|
||||
lines = content.strip().split("\n")
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if line.startswith("# "):
|
||||
title = line[2:].strip()
|
||||
break
|
||||
elif line and not line.startswith("#"):
|
||||
title = line[:100]
|
||||
break
|
||||
# Extract title using intelligent extraction function
|
||||
title = extract_title_from_content(content)
|
||||
|
||||
return {
|
||||
"url": url,
|
||||
@@ -248,15 +317,48 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
|
||||
}
|
||||
|
||||
|
||||
def _extract_domain(url: str) -> str:
|
||||
"""Extract domain from URL."""
|
||||
def _extract_domain(url: Optional[str]) -> str:
|
||||
"""
|
||||
Extract domain from URL using urllib with regex fallback.
|
||||
|
||||
Handles:
|
||||
- Standard URLs: https://www.example.com/path
|
||||
- Short URLs: example.com
|
||||
- Invalid URLs: graceful fallback
|
||||
|
||||
Args:
|
||||
url: The URL string to extract domain from (can be None)
|
||||
|
||||
Returns:
|
||||
The domain netloc (including port if present), or empty string if extraction fails
|
||||
"""
|
||||
if not url:
|
||||
return ""
|
||||
|
||||
# Approach 1: Try urllib first (fast path for standard URLs)
|
||||
try:
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
parsed = urlparse(url)
|
||||
return parsed.netloc
|
||||
except Exception:
|
||||
return ""
|
||||
if parsed.netloc:
|
||||
return parsed.netloc
|
||||
except Exception as e:
|
||||
logger.debug(f"URL parsing failed for {url}: {e}")
|
||||
|
||||
# Approach 2: Regex fallback (for non-standard or bare URLs without scheme)
|
||||
# Matches: domain[:port] where domain is a valid hostname
|
||||
# Pattern breakdown:
|
||||
# ([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)
|
||||
# - domain labels separated by dots, each 1-63 chars, starting/ending with alphanumeric
|
||||
# (?::\d+)? - optional port
|
||||
pattern = r'^([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*(?::\d+)?)(?:[/?#]|$)'
|
||||
|
||||
match = re.match(pattern, url)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
logger.warning(f"Could not extract domain from URL: {url}")
|
||||
return ""
|
||||
|
||||
|
||||
def merge_citations(
|
||||
|
||||
@@ -6,9 +6,9 @@ Citation formatter for generating citation sections and inline references.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Tuple
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from .models import Citation, CitationMetadata
|
||||
from .models import Citation
|
||||
|
||||
|
||||
class CitationFormatter:
|
||||
@@ -239,33 +239,159 @@ class CitationFormatter:
|
||||
return json.dumps(data, ensure_ascii=False)
|
||||
|
||||
|
||||
def parse_citations_from_report(
    report: str, section_patterns: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Extract citation information from report, supporting multiple formats.

    Supports various citation formats:
    - Markdown: [Title](URL)
    - Numbered: [1] Title - URL
    - Footnote: [^1]: Title - URL
    - HTML: <a href="URL">Title</a>

    Args:
        report: The report markdown text
        section_patterns: Custom section header patterns (optional)

    Returns:
        Dictionary with 'citations' list and 'count' of unique citations
    """
    if section_patterns is None:
        section_patterns = [
            r"(?:##\s*Key Citations|##\s*References|##\s*Sources|##\s*Bibliography)",
        ]

    citations = []

    # 1. Find citation section(s) and extract citations from each.
    for pattern in section_patterns:
        # Match line-by-line content up to the next "##" heading instead of
        # relying on greedy DOTALL matching for large reports.
        section_matches = re.finditer(
            pattern + r"\s*\n((?:(?!\n##).*\n?)*)",
            report,
            re.IGNORECASE | re.MULTILINE,
        )

        for section_match in section_matches:
            section = section_match.group(1)

            # 2. Extract citations in every supported format.
            citations.extend(_extract_markdown_links(section))
            citations.extend(_extract_numbered_citations(section))
            citations.extend(_extract_footnote_citations(section))
            citations.extend(_extract_html_links(section))

    # 3. Deduplicate by URL (first occurrence wins).
    unique_citations = {}
    for citation in citations:
        url = citation.get("url", "")
        if url and url not in unique_citations:
            unique_citations[url] = citation

    return {
        "citations": list(unique_citations.values()),
        "count": len(unique_citations),
    }
|
||||
|
||||
|
||||
def _extract_markdown_links(text: str) -> List[Dict[str, str]]:
|
||||
"""
|
||||
Extract Markdown links [title](url).
|
||||
|
||||
Args:
|
||||
text: Text to extract from
|
||||
|
||||
Returns:
|
||||
List of citation dictionaries with title, url, and format
|
||||
"""
|
||||
citations = []
|
||||
|
||||
# Find the Key Citations section
|
||||
section_pattern = (
|
||||
r"(?:##\s*Key Citations|##\s*References|##\s*Sources)\s*\n(.*?)(?=\n##|\Z)"
|
||||
)
|
||||
section_match = re.search(section_pattern, report, re.IGNORECASE | re.DOTALL)
|
||||
|
||||
if section_match:
|
||||
section = section_match.group(1)
|
||||
|
||||
# Extract markdown links
|
||||
link_pattern = r"\[([^\]]+)\]\(([^)]+)\)"
|
||||
for match in re.finditer(link_pattern, section):
|
||||
title = match.group(1)
|
||||
url = match.group(2)
|
||||
if url.startswith(("http://", "https://")):
|
||||
citations.append((title, url))
|
||||
|
||||
pattern = r"\[([^\]]+)\]\(([^)]+)\)"
|
||||
|
||||
for match in re.finditer(pattern, text):
|
||||
title, url = match.groups()
|
||||
if url.startswith(("http://", "https://")):
|
||||
citations.append({
|
||||
"title": title.strip(),
|
||||
"url": url.strip(),
|
||||
"format": "markdown",
|
||||
})
|
||||
|
||||
return citations
|
||||
|
||||
|
||||
def _extract_numbered_citations(text: str) -> List[Dict[str, str]]:
|
||||
"""
|
||||
Extract numbered citations [1] Title - URL.
|
||||
|
||||
Args:
|
||||
text: Text to extract from
|
||||
|
||||
Returns:
|
||||
List of citation dictionaries
|
||||
"""
|
||||
citations = []
|
||||
# Match: [number] title - URL
|
||||
pattern = r"\[\d+\]\s+([^-\n]+?)\s*-\s*(https?://[^\s\n]+)"
|
||||
|
||||
for match in re.finditer(pattern, text):
|
||||
title, url = match.groups()
|
||||
citations.append({
|
||||
"title": title.strip(),
|
||||
"url": url.strip(),
|
||||
"format": "numbered",
|
||||
})
|
||||
|
||||
return citations
|
||||
|
||||
|
||||
def _extract_footnote_citations(text: str) -> List[Dict[str, str]]:
|
||||
"""
|
||||
Extract footnote citations [^1]: Title - URL.
|
||||
|
||||
Args:
|
||||
text: Text to extract from
|
||||
|
||||
Returns:
|
||||
List of citation dictionaries
|
||||
"""
|
||||
citations = []
|
||||
# Match: [^number]: title - URL
|
||||
pattern = r"\[\^(\d+)\]:\s+([^-\n]+?)\s*-\s*(https?://[^\s\n]+)"
|
||||
|
||||
for match in re.finditer(pattern, text):
|
||||
_, title, url = match.groups()
|
||||
citations.append({
|
||||
"title": title.strip(),
|
||||
"url": url.strip(),
|
||||
"format": "footnote",
|
||||
})
|
||||
|
||||
return citations
|
||||
|
||||
|
||||
def _extract_html_links(text: str) -> List[Dict[str, str]]:
|
||||
"""
|
||||
Extract HTML links <a href="url">title</a>.
|
||||
|
||||
Args:
|
||||
text: Text to extract from
|
||||
|
||||
Returns:
|
||||
List of citation dictionaries
|
||||
"""
|
||||
citations = []
|
||||
pattern = r'<a\s+(?:[^>]*?\s)?href=(["\'])([^"\']+)\1[^>]*>([^<]+)</a>'
|
||||
|
||||
for match in re.finditer(pattern, text, re.IGNORECASE):
|
||||
_, url, title = match.groups()
|
||||
if url.startswith(("http://", "https://")):
|
||||
citations.append({
|
||||
"title": title.strip(),
|
||||
"url": url.strip(),
|
||||
"format": "html",
|
||||
})
|
||||
|
||||
return citations
|
||||
|
||||
@@ -6,14 +6,14 @@ Citation data models for structured source metadata.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
@dataclass
|
||||
class CitationMetadata:
|
||||
|
||||
class CitationMetadata(BaseModel):
|
||||
"""Metadata extracted from a source."""
|
||||
|
||||
# Core identifiers
|
||||
@@ -32,7 +32,7 @@ class CitationMetadata:
|
||||
language: Optional[str] = None
|
||||
|
||||
# Media
|
||||
images: List[str] = field(default_factory=list)
|
||||
images: List[str] = Field(default_factory=list)
|
||||
favicon: Optional[str] = None
|
||||
|
||||
# Quality indicators
|
||||
@@ -40,13 +40,16 @@ class CitationMetadata:
|
||||
credibility_score: float = 0.0
|
||||
|
||||
# Timestamps
|
||||
accessed_at: str = field(default_factory=lambda: datetime.now().isoformat())
|
||||
accessed_at: str = Field(default_factory=lambda: datetime.now().isoformat())
|
||||
|
||||
# Additional metadata
|
||||
extra: Dict[str, Any] = field(default_factory=dict)
|
||||
extra: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
def __post_init__(self):
|
||||
"""Extract domain from URL if not provided."""
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
def __init__(self, **data):
|
||||
"""Initialize and extract domain from URL if not provided."""
|
||||
super().__init__(**data)
|
||||
if not self.domain and self.url:
|
||||
try:
|
||||
parsed = urlparse(self.url)
|
||||
@@ -87,7 +90,7 @@ class CitationMetadata:
|
||||
"""Create from dictionary."""
|
||||
# Remove 'id' as it's computed from url
|
||||
data = {k: v for k, v in data.items() if k != "id"}
|
||||
return cls(**data)
|
||||
return cls.model_validate(data)
|
||||
|
||||
@classmethod
|
||||
def from_search_result(
|
||||
@@ -107,8 +110,8 @@ class CitationMetadata:
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Citation:
|
||||
|
||||
class Citation(BaseModel):
|
||||
"""
|
||||
A citation reference that can be used in reports.
|
||||
|
||||
@@ -127,6 +130,8 @@ class Citation:
|
||||
# Specific quote or fact being cited
|
||||
cited_text: Optional[str] = None
|
||||
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
@property
|
||||
def id(self) -> str:
|
||||
"""Get the citation ID from metadata."""
|
||||
@@ -154,12 +159,14 @@ class Citation:
|
||||
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "Citation":
    """
    Create a Citation from a dictionary.

    Accepts ``metadata`` either as a nested dict (converted via
    CitationMetadata.from_dict) or as an already-built CitationMetadata
    instance, then validates the whole payload through pydantic.
    """
    metadata = data["metadata"]
    return cls.model_validate({
        "number": data["number"],
        "metadata": CitationMetadata.from_dict(metadata)
        if isinstance(metadata, dict)
        else metadata,
        "context": data.get("context"),
        "cited_text": data.get("cited_text"),
    })
|
||||
|
||||
def to_markdown_reference(self) -> str:
|
||||
"""Generate markdown reference format: [Title](URL)"""
|
||||
|
||||
Reference in New Issue
Block a user