chore: Improved citation system (#834)

* improve: Improved citation system

* fix

---------

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
Xun
2026-01-25 15:49:45 +08:00
committed by GitHub
parent 31624b64b8
commit 9a34e32252
8 changed files with 1735 additions and 65 deletions

View File

@@ -28,6 +28,7 @@ class CitationCollector:
self._citations: Dict[str, CitationMetadata] = {} # url -> metadata
self._citation_order: List[str] = [] # ordered list of URLs
self._used_citations: set[str] = set() # URLs that are actually cited
self._url_to_index: Dict[str, int] = {} # url -> index of _citation_order (O(1) lookup)
def add_from_search_results(
self, results: List[Dict[str, Any]], query: str = ""
@@ -58,6 +59,7 @@ class CitationCollector:
if url not in self._citations:
self._citations[url] = metadata
self._citation_order.append(url)
self._url_to_index[url] = len(self._citation_order) - 1
added.append(metadata)
logger.debug(f"Added citation: {metadata.title} ({url})")
else:
@@ -104,6 +106,7 @@ class CitationCollector:
)
self._citations[url] = metadata
self._citation_order.append(url)
self._url_to_index[url] = len(self._citation_order) - 1
return metadata
@@ -124,7 +127,7 @@ class CitationCollector:
def get_number(self, url: str) -> Optional[int]:
    """
    Get the citation number for a URL (O(1) time complexity).

    Looks the URL up in the ``_url_to_index`` map maintained alongside
    ``_citation_order`` instead of scanning the list.

    Args:
        url: The URL to look up

    Returns:
        The citation number (1-indexed) or None if not found
    """
    # Diff residue removed: the old O(n) list.index() path was left in
    # front of the new dict lookup, shadowing it entirely.
    index = self._url_to_index.get(url)
    return index + 1 if index is not None else None
def get_metadata(self, url: str) -> Optional[CitationMetadata]:
"""
@@ -215,7 +216,9 @@ class CitationCollector:
for citation_data in data.get("citations", []):
citation = Citation.from_dict(citation_data)
collector._citations[citation.url] = citation.metadata
index = len(collector._citation_order)
collector._citation_order.append(citation.url)
collector._url_to_index[citation.url] = index
collector._used_citations = set(data.get("used_urls", []))
return collector
@@ -230,6 +233,7 @@ class CitationCollector:
if url not in self._citations:
self._citations[url] = other._citations[url]
self._citation_order.append(url)
self._url_to_index[url] = len(self._citation_order) - 1
self._used_citations.update(other._used_citations)
@property
@@ -247,6 +251,7 @@ class CitationCollector:
self._citations.clear()
self._citation_order.clear()
self._used_citations.clear()
self._url_to_index.clear()
def extract_urls_from_text(text: str) -> List[str]:

View File

@@ -7,6 +7,7 @@ Citation extraction utilities for extracting citations from tool results.
import json
import logging
import re
from typing import Any, Dict, List, Optional
from langchain_core.messages import AIMessage, ToolMessage
@@ -205,6 +206,84 @@ def _result_to_citation(result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
}
def extract_title_from_content(content: Optional[str], max_length: int = 200) -> str:
    """
    Intelligent title extraction supporting multiple formats.

    Priority:
    1. HTML <title> tag
    2. Markdown h1 (# Title)
    3. Markdown h2-h6 (## Title, etc.)
    4. JSON/YAML title field
    5. First substantial non-empty line
    6. "Untitled" as fallback

    Args:
        content: The content to extract title from (can be None)
        max_length: Maximum title length (default: 200)

    Returns:
        Extracted title or "Untitled"
    """
    if not content:
        return "Untitled"

    # 1. Try HTML title tag (DOTALL so titles spanning lines still match)
    html_title_match = re.search(
        r'<title[^>]*>([^<]+)</title>',
        content,
        re.IGNORECASE | re.DOTALL
    )
    if html_title_match:
        title = html_title_match.group(1).strip()
        if title:
            return title[:max_length]

    # 2. Try Markdown h1 (exactly one '#': '##' fails because the second
    # '#' is not whitespace)
    md_h1_match = re.search(
        r'^#{1}\s+(.+?)$',
        content,
        re.MULTILINE
    )
    if md_h1_match:
        title = md_h1_match.group(1).strip()
        if title:
            return title[:max_length]

    # 3. Try any Markdown heading (h2-h6)
    md_heading_match = re.search(
        r'^#{2,6}\s+(.+?)$',
        content,
        re.MULTILINE
    )
    if md_heading_match:
        title = md_heading_match.group(1).strip()
        if title:
            return title[:max_length]

    # 4. Try JSON/YAML title field.
    # Bug fix: the lookbehind rejects matches embedded inside a longer
    # identifier (e.g. "subtitle:" previously matched on its "title:" tail).
    json_title_match = re.search(
        r'(?<![A-Za-z0-9_])"?title"?\s*:\s*["\']?([^"\'\n]+)["\']?',
        content,
        re.IGNORECASE
    )
    if json_title_match:
        title = json_title_match.group(1).strip()
        if title and len(title) > 3:
            return title[:max_length]

    # 5. First substantial non-empty line
    for line in content.split('\n'):
        line = line.strip()
        # Skip short lines, code blocks, list items, and separators
        if (line and
                len(line) > 10 and
                not line.startswith(('```', '---', '***', '- ', '* ', '+ ', '#'))):
            return line[:max_length]

    return "Untitled"
def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
"""
Extract citation from crawl tool result.
@@ -224,18 +303,8 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
content = data.get("crawled_content", "")
# Try to extract title from content (first h1 or first line)
title = "Untitled"
if content:
lines = content.strip().split("\n")
for line in lines:
line = line.strip()
if line.startswith("# "):
title = line[2:].strip()
break
elif line and not line.startswith("#"):
title = line[:100]
break
# Extract title using intelligent extraction function
title = extract_title_from_content(content)
return {
"url": url,
@@ -248,15 +317,48 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
}
def _extract_domain(url: str) -> str:
"""Extract domain from URL."""
def _extract_domain(url: Optional[str]) -> str:
"""
Extract domain from URL using urllib with regex fallback.
Handles:
- Standard URLs: https://www.example.com/path
- Short URLs: example.com
- Invalid URLs: graceful fallback
Args:
url: The URL string to extract domain from (can be None)
Returns:
The domain netloc (including port if present), or empty string if extraction fails
"""
if not url:
return ""
# Approach 1: Try urllib first (fast path for standard URLs)
try:
from urllib.parse import urlparse
parsed = urlparse(url)
return parsed.netloc
except Exception:
return ""
if parsed.netloc:
return parsed.netloc
except Exception as e:
logger.debug(f"URL parsing failed for {url}: {e}")
# Approach 2: Regex fallback (for non-standard or bare URLs without scheme)
# Matches: domain[:port] where domain is a valid hostname
# Pattern breakdown:
# ([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)
# - domain labels separated by dots, each 1-63 chars, starting/ending with alphanumeric
# (?::\d+)? - optional port
pattern = r'^([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*(?::\d+)?)(?:[/?#]|$)'
match = re.match(pattern, url)
if match:
return match.group(1)
logger.warning(f"Could not extract domain from URL: {url}")
return ""
def merge_citations(

View File

@@ -6,9 +6,9 @@ Citation formatter for generating citation sections and inline references.
"""
import re
from typing import Dict, List, Tuple
from typing import Any, Dict, List
from .models import Citation, CitationMetadata
from .models import Citation
class CitationFormatter:
@@ -239,33 +239,159 @@ class CitationFormatter:
return json.dumps(data, ensure_ascii=False)
def parse_citations_from_report(
    report: str, section_patterns: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Extract citation information from report, supporting multiple formats.

    Supports various citation formats:
    - Markdown: [Title](URL)
    - Numbered: [1] Title - URL
    - Footnote: [^1]: Title - URL
    - HTML: <a href="URL">Title</a>

    Args:
        report: The report markdown text
        section_patterns: Custom section header patterns (optional)

    Returns:
        Dictionary with 'citations' list and 'count' of unique citations
    """
    # Annotation fixed: the default is None, so the parameter is Optional.
    if section_patterns is None:
        section_patterns = [
            r"(?:##\s*Key Citations|##\s*References|##\s*Sources|##\s*Bibliography)",
        ]

    citations = []

    # 1. Find citation section(s) and extract citations.
    # The pattern matches line-by-line content up to the next "##" heading
    # instead of relying on dotall with greedy matching for large reports.
    for pattern in section_patterns:
        section_matches = re.finditer(
            pattern + r"\s*\n((?:(?!\n##).*\n?)*)",
            report,
            re.IGNORECASE | re.MULTILINE,
        )
        for section_match in section_matches:
            section = section_match.group(1)

            # 2. Extract citations in various formats
            citations.extend(_extract_markdown_links(section))
            citations.extend(_extract_numbered_citations(section))
            citations.extend(_extract_footnote_citations(section))
            citations.extend(_extract_html_links(section))

    # 3. Deduplicate by URL, keeping the first occurrence
    unique_citations = {}
    for citation in citations:
        url = citation.get("url", "")
        if url and url not in unique_citations:
            unique_citations[url] = citation

    return {
        "citations": list(unique_citations.values()),
        "count": len(unique_citations),
    }
def _extract_markdown_links(text: str) -> List[Dict[str, str]]:
"""
Extract Markdown links [title](url).
Args:
text: Text to extract from
Returns:
List of citation dictionaries with title, url, and format
"""
citations = []
# Find the Key Citations section
section_pattern = (
r"(?:##\s*Key Citations|##\s*References|##\s*Sources)\s*\n(.*?)(?=\n##|\Z)"
)
section_match = re.search(section_pattern, report, re.IGNORECASE | re.DOTALL)
if section_match:
section = section_match.group(1)
# Extract markdown links
link_pattern = r"\[([^\]]+)\]\(([^)]+)\)"
for match in re.finditer(link_pattern, section):
title = match.group(1)
url = match.group(2)
if url.startswith(("http://", "https://")):
citations.append((title, url))
pattern = r"\[([^\]]+)\]\(([^)]+)\)"
for match in re.finditer(pattern, text):
title, url = match.groups()
if url.startswith(("http://", "https://")):
citations.append({
"title": title.strip(),
"url": url.strip(),
"format": "markdown",
})
return citations
def _extract_numbered_citations(text: str) -> List[Dict[str, str]]:
"""
Extract numbered citations [1] Title - URL.
Args:
text: Text to extract from
Returns:
List of citation dictionaries
"""
citations = []
# Match: [number] title - URL
pattern = r"\[\d+\]\s+([^-\n]+?)\s*-\s*(https?://[^\s\n]+)"
for match in re.finditer(pattern, text):
title, url = match.groups()
citations.append({
"title": title.strip(),
"url": url.strip(),
"format": "numbered",
})
return citations
def _extract_footnote_citations(text: str) -> List[Dict[str, str]]:
"""
Extract footnote citations [^1]: Title - URL.
Args:
text: Text to extract from
Returns:
List of citation dictionaries
"""
citations = []
# Match: [^number]: title - URL
pattern = r"\[\^(\d+)\]:\s+([^-\n]+?)\s*-\s*(https?://[^\s\n]+)"
for match in re.finditer(pattern, text):
_, title, url = match.groups()
citations.append({
"title": title.strip(),
"url": url.strip(),
"format": "footnote",
})
return citations
def _extract_html_links(text: str) -> List[Dict[str, str]]:
"""
Extract HTML links <a href="url">title</a>.
Args:
text: Text to extract from
Returns:
List of citation dictionaries
"""
citations = []
pattern = r'<a\s+(?:[^>]*?\s)?href=(["\'])([^"\']+)\1[^>]*>([^<]+)</a>'
for match in re.finditer(pattern, text, re.IGNORECASE):
_, url, title = match.groups()
if url.startswith(("http://", "https://")):
citations.append({
"title": title.strip(),
"url": url.strip(),
"format": "html",
})
return citations

View File

@@ -6,14 +6,14 @@ Citation data models for structured source metadata.
"""
import hashlib
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
from pydantic import BaseModel, ConfigDict, Field
@dataclass
class CitationMetadata:
class CitationMetadata(BaseModel):
"""Metadata extracted from a source."""
# Core identifiers
@@ -32,7 +32,7 @@ class CitationMetadata:
language: Optional[str] = None
# Media
images: List[str] = field(default_factory=list)
images: List[str] = Field(default_factory=list)
favicon: Optional[str] = None
# Quality indicators
@@ -40,13 +40,16 @@ class CitationMetadata:
credibility_score: float = 0.0
# Timestamps
accessed_at: str = field(default_factory=lambda: datetime.now().isoformat())
accessed_at: str = Field(default_factory=lambda: datetime.now().isoformat())
# Additional metadata
extra: Dict[str, Any] = field(default_factory=dict)
extra: Dict[str, Any] = Field(default_factory=dict)
def __post_init__(self):
"""Extract domain from URL if not provided."""
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(self, **data):
"""Initialize and extract domain from URL if not provided."""
super().__init__(**data)
if not self.domain and self.url:
try:
parsed = urlparse(self.url)
@@ -87,7 +90,7 @@ class CitationMetadata:
"""Create from dictionary."""
# Remove 'id' as it's computed from url
data = {k: v for k, v in data.items() if k != "id"}
return cls(**data)
return cls.model_validate(data)
@classmethod
def from_search_result(
@@ -107,8 +110,8 @@ class CitationMetadata:
)
@dataclass
class Citation:
class Citation(BaseModel):
"""
A citation reference that can be used in reports.
@@ -127,6 +130,8 @@ class Citation:
# Specific quote or fact being cited
cited_text: Optional[str] = None
model_config = ConfigDict(arbitrary_types_allowed=True)
@property
def id(self) -> str:
"""Get the citation ID from metadata."""
@@ -154,12 +159,14 @@ class Citation:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "Citation":
    """
    Create a Citation from a dictionary.

    Accepts ``metadata`` either as a raw dict (converted via
    CitationMetadata.from_dict) or as an already-built metadata object.
    """
    # Diff residue removed: the old constructor-based return preceded this
    # one, making the model_validate path unreachable.
    return cls.model_validate({
        "number": data["number"],
        "metadata": CitationMetadata.from_dict(data["metadata"])
        if isinstance(data.get("metadata"), dict)
        else data["metadata"],
        "context": data.get("context"),
        "cited_text": data.get("cited_text"),
    })
def to_markdown_reference(self) -> str:
"""Generate markdown reference format: [Title](URL)"""