chore: Improved citation system (#834)

* improve: Improved citation system

* fix

---------

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
Xun
2026-01-25 15:49:45 +08:00
committed by GitHub
parent 31624b64b8
commit 9a34e32252
8 changed files with 1735 additions and 65 deletions

View File

@@ -28,6 +28,7 @@ class CitationCollector:
self._citations: Dict[str, CitationMetadata] = {} # url -> metadata
self._citation_order: List[str] = [] # ordered list of URLs
self._used_citations: set[str] = set() # URLs that are actually cited
self._url_to_index: Dict[str, int] = {} # url -> index of _citation_order (O(1) lookup)
def add_from_search_results(
self, results: List[Dict[str, Any]], query: str = ""
@@ -58,6 +59,7 @@ class CitationCollector:
if url not in self._citations:
self._citations[url] = metadata
self._citation_order.append(url)
self._url_to_index[url] = len(self._citation_order) - 1
added.append(metadata)
logger.debug(f"Added citation: {metadata.title} ({url})")
else:
@@ -104,6 +106,7 @@ class CitationCollector:
)
self._citations[url] = metadata
self._citation_order.append(url)
self._url_to_index[url] = len(self._citation_order) - 1
return metadata
@@ -124,7 +127,7 @@ class CitationCollector:
def get_number(self, url: str) -> Optional[int]:
    """
    Get the citation number for a URL (O(1) time complexity).

    Looks the URL up in the ``_url_to_index`` map maintained alongside
    ``_citation_order`` instead of scanning the list.

    Args:
        url: The URL to look up

    Returns:
        The citation number (1-indexed) or None if not found
    """
    # Diff residue removed: the old O(n) list.index() path was left in
    # front of the new dict lookup, shadowing it entirely.
    index = self._url_to_index.get(url)
    return index + 1 if index is not None else None
def get_metadata(self, url: str) -> Optional[CitationMetadata]:
"""
@@ -215,7 +216,9 @@ class CitationCollector:
for citation_data in data.get("citations", []):
citation = Citation.from_dict(citation_data)
collector._citations[citation.url] = citation.metadata
index = len(collector._citation_order)
collector._citation_order.append(citation.url)
collector._url_to_index[citation.url] = index
collector._used_citations = set(data.get("used_urls", []))
return collector
@@ -230,6 +233,7 @@ class CitationCollector:
if url not in self._citations:
self._citations[url] = other._citations[url]
self._citation_order.append(url)
self._url_to_index[url] = len(self._citation_order) - 1
self._used_citations.update(other._used_citations)
@property
@@ -247,6 +251,7 @@ class CitationCollector:
self._citations.clear()
self._citation_order.clear()
self._used_citations.clear()
self._url_to_index.clear()
def extract_urls_from_text(text: str) -> List[str]:

View File

@@ -7,6 +7,7 @@ Citation extraction utilities for extracting citations from tool results.
import json
import logging
import re
from typing import Any, Dict, List, Optional
from langchain_core.messages import AIMessage, ToolMessage
@@ -205,6 +206,84 @@ def _result_to_citation(result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
}
def extract_title_from_content(content: Optional[str], max_length: int = 200) -> str:
    """
    Intelligent title extraction supporting multiple formats.

    Priority:
    1. HTML <title> tag
    2. Markdown h1 (# Title)
    3. Markdown h2-h6 (## Title, etc.)
    4. JSON/YAML title field
    5. First substantial non-empty line
    6. "Untitled" as fallback

    Args:
        content: The content to extract title from (can be None)
        max_length: Maximum title length (default: 200)

    Returns:
        Extracted title or "Untitled"
    """
    if not content:
        return "Untitled"

    # 1. Try HTML title tag (DOTALL so titles spanning lines still match)
    html_title_match = re.search(
        r'<title[^>]*>([^<]+)</title>',
        content,
        re.IGNORECASE | re.DOTALL
    )
    if html_title_match:
        title = html_title_match.group(1).strip()
        if title:
            return title[:max_length]

    # 2. Try Markdown h1 (exactly one '#': '##' fails because the second
    # '#' is not whitespace)
    md_h1_match = re.search(
        r'^#{1}\s+(.+?)$',
        content,
        re.MULTILINE
    )
    if md_h1_match:
        title = md_h1_match.group(1).strip()
        if title:
            return title[:max_length]

    # 3. Try any Markdown heading (h2-h6)
    md_heading_match = re.search(
        r'^#{2,6}\s+(.+?)$',
        content,
        re.MULTILINE
    )
    if md_heading_match:
        title = md_heading_match.group(1).strip()
        if title:
            return title[:max_length]

    # 4. Try JSON/YAML title field.
    # Bug fix: the lookbehind rejects matches embedded inside a longer
    # identifier (e.g. "subtitle:" previously matched on its "title:" tail).
    json_title_match = re.search(
        r'(?<![A-Za-z0-9_])"?title"?\s*:\s*["\']?([^"\'\n]+)["\']?',
        content,
        re.IGNORECASE
    )
    if json_title_match:
        title = json_title_match.group(1).strip()
        if title and len(title) > 3:
            return title[:max_length]

    # 5. First substantial non-empty line
    for line in content.split('\n'):
        line = line.strip()
        # Skip short lines, code blocks, list items, and separators
        if (line and
                len(line) > 10 and
                not line.startswith(('```', '---', '***', '- ', '* ', '+ ', '#'))):
            return line[:max_length]

    return "Untitled"
def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
"""
Extract citation from crawl tool result.
@@ -224,18 +303,8 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
content = data.get("crawled_content", "")
# Try to extract title from content (first h1 or first line)
title = "Untitled"
if content:
lines = content.strip().split("\n")
for line in lines:
line = line.strip()
if line.startswith("# "):
title = line[2:].strip()
break
elif line and not line.startswith("#"):
title = line[:100]
break
# Extract title using intelligent extraction function
title = extract_title_from_content(content)
return {
"url": url,
@@ -248,15 +317,48 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
}
def _extract_domain(url: str) -> str:
"""Extract domain from URL."""
def _extract_domain(url: Optional[str]) -> str:
"""
Extract domain from URL using urllib with regex fallback.
Handles:
- Standard URLs: https://www.example.com/path
- Short URLs: example.com
- Invalid URLs: graceful fallback
Args:
url: The URL string to extract domain from (can be None)
Returns:
The domain netloc (including port if present), or empty string if extraction fails
"""
if not url:
return ""
# Approach 1: Try urllib first (fast path for standard URLs)
try:
from urllib.parse import urlparse
parsed = urlparse(url)
return parsed.netloc
except Exception:
return ""
if parsed.netloc:
return parsed.netloc
except Exception as e:
logger.debug(f"URL parsing failed for {url}: {e}")
# Approach 2: Regex fallback (for non-standard or bare URLs without scheme)
# Matches: domain[:port] where domain is a valid hostname
# Pattern breakdown:
# ([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)
# - domain labels separated by dots, each 1-63 chars, starting/ending with alphanumeric
# (?::\d+)? - optional port
pattern = r'^([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*(?::\d+)?)(?:[/?#]|$)'
match = re.match(pattern, url)
if match:
return match.group(1)
logger.warning(f"Could not extract domain from URL: {url}")
return ""
def merge_citations(

View File

@@ -6,9 +6,9 @@ Citation formatter for generating citation sections and inline references.
"""
import re
from typing import Dict, List, Tuple
from typing import Any, Dict, List
from .models import Citation, CitationMetadata
from .models import Citation
class CitationFormatter:
@@ -239,33 +239,159 @@ class CitationFormatter:
return json.dumps(data, ensure_ascii=False)
def parse_citations_from_report(
    report: str, section_patterns: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Extract citation information from report, supporting multiple formats.

    Supports various citation formats:
    - Markdown: [Title](URL)
    - Numbered: [1] Title - URL
    - Footnote: [^1]: Title - URL
    - HTML: <a href="URL">Title</a>

    Args:
        report: The report markdown text
        section_patterns: Custom section header patterns (optional)

    Returns:
        Dictionary with 'citations' list and 'count' of unique citations
    """
    # Annotation fixed: the default is None, so the parameter is Optional.
    if section_patterns is None:
        section_patterns = [
            r"(?:##\s*Key Citations|##\s*References|##\s*Sources|##\s*Bibliography)",
        ]

    citations = []

    # 1. Find citation section(s) and extract citations.
    # The pattern matches line-by-line content up to the next "##" heading
    # instead of relying on dotall with greedy matching for large reports.
    for pattern in section_patterns:
        section_matches = re.finditer(
            pattern + r"\s*\n((?:(?!\n##).*\n?)*)",
            report,
            re.IGNORECASE | re.MULTILINE,
        )
        for section_match in section_matches:
            section = section_match.group(1)

            # 2. Extract citations in various formats
            citations.extend(_extract_markdown_links(section))
            citations.extend(_extract_numbered_citations(section))
            citations.extend(_extract_footnote_citations(section))
            citations.extend(_extract_html_links(section))

    # 3. Deduplicate by URL, keeping the first occurrence
    unique_citations = {}
    for citation in citations:
        url = citation.get("url", "")
        if url and url not in unique_citations:
            unique_citations[url] = citation

    return {
        "citations": list(unique_citations.values()),
        "count": len(unique_citations),
    }
def _extract_markdown_links(text: str) -> List[Dict[str, str]]:
"""
Extract Markdown links [title](url).
Args:
text: Text to extract from
Returns:
List of citation dictionaries with title, url, and format
"""
citations = []
# Find the Key Citations section
section_pattern = (
r"(?:##\s*Key Citations|##\s*References|##\s*Sources)\s*\n(.*?)(?=\n##|\Z)"
)
section_match = re.search(section_pattern, report, re.IGNORECASE | re.DOTALL)
if section_match:
section = section_match.group(1)
# Extract markdown links
link_pattern = r"\[([^\]]+)\]\(([^)]+)\)"
for match in re.finditer(link_pattern, section):
title = match.group(1)
url = match.group(2)
if url.startswith(("http://", "https://")):
citations.append((title, url))
pattern = r"\[([^\]]+)\]\(([^)]+)\)"
for match in re.finditer(pattern, text):
title, url = match.groups()
if url.startswith(("http://", "https://")):
citations.append({
"title": title.strip(),
"url": url.strip(),
"format": "markdown",
})
return citations
def _extract_numbered_citations(text: str) -> List[Dict[str, str]]:
"""
Extract numbered citations [1] Title - URL.
Args:
text: Text to extract from
Returns:
List of citation dictionaries
"""
citations = []
# Match: [number] title - URL
pattern = r"\[\d+\]\s+([^-\n]+?)\s*-\s*(https?://[^\s\n]+)"
for match in re.finditer(pattern, text):
title, url = match.groups()
citations.append({
"title": title.strip(),
"url": url.strip(),
"format": "numbered",
})
return citations
def _extract_footnote_citations(text: str) -> List[Dict[str, str]]:
"""
Extract footnote citations [^1]: Title - URL.
Args:
text: Text to extract from
Returns:
List of citation dictionaries
"""
citations = []
# Match: [^number]: title - URL
pattern = r"\[\^(\d+)\]:\s+([^-\n]+?)\s*-\s*(https?://[^\s\n]+)"
for match in re.finditer(pattern, text):
_, title, url = match.groups()
citations.append({
"title": title.strip(),
"url": url.strip(),
"format": "footnote",
})
return citations
def _extract_html_links(text: str) -> List[Dict[str, str]]:
"""
Extract HTML links <a href="url">title</a>.
Args:
text: Text to extract from
Returns:
List of citation dictionaries
"""
citations = []
pattern = r'<a\s+(?:[^>]*?\s)?href=(["\'])([^"\']+)\1[^>]*>([^<]+)</a>'
for match in re.finditer(pattern, text, re.IGNORECASE):
_, url, title = match.groups()
if url.startswith(("http://", "https://")):
citations.append({
"title": title.strip(),
"url": url.strip(),
"format": "html",
})
return citations

View File

@@ -6,14 +6,14 @@ Citation data models for structured source metadata.
"""
import hashlib
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
from pydantic import BaseModel, ConfigDict, Field
@dataclass
class CitationMetadata:
class CitationMetadata(BaseModel):
"""Metadata extracted from a source."""
# Core identifiers
@@ -32,7 +32,7 @@ class CitationMetadata:
language: Optional[str] = None
# Media
images: List[str] = field(default_factory=list)
images: List[str] = Field(default_factory=list)
favicon: Optional[str] = None
# Quality indicators
@@ -40,13 +40,16 @@ class CitationMetadata:
credibility_score: float = 0.0
# Timestamps
accessed_at: str = field(default_factory=lambda: datetime.now().isoformat())
accessed_at: str = Field(default_factory=lambda: datetime.now().isoformat())
# Additional metadata
extra: Dict[str, Any] = field(default_factory=dict)
extra: Dict[str, Any] = Field(default_factory=dict)
def __post_init__(self):
"""Extract domain from URL if not provided."""
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(self, **data):
"""Initialize and extract domain from URL if not provided."""
super().__init__(**data)
if not self.domain and self.url:
try:
parsed = urlparse(self.url)
@@ -87,7 +90,7 @@ class CitationMetadata:
"""Create from dictionary."""
# Remove 'id' as it's computed from url
data = {k: v for k, v in data.items() if k != "id"}
return cls(**data)
return cls.model_validate(data)
@classmethod
def from_search_result(
@@ -107,8 +110,8 @@ class CitationMetadata:
)
@dataclass
class Citation:
class Citation(BaseModel):
"""
A citation reference that can be used in reports.
@@ -127,6 +130,8 @@ class Citation:
# Specific quote or fact being cited
cited_text: Optional[str] = None
model_config = ConfigDict(arbitrary_types_allowed=True)
@property
def id(self) -> str:
"""Get the citation ID from metadata."""
@@ -154,12 +159,14 @@ class Citation:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "Citation":
    """
    Create a Citation from a dictionary.

    Accepts ``metadata`` either as a raw dict (converted via
    CitationMetadata.from_dict) or as an already-built metadata object.
    """
    # Diff residue removed: the old constructor-based return preceded this
    # one, making the model_validate path unreachable.
    return cls.model_validate({
        "number": data["number"],
        "metadata": CitationMetadata.from_dict(data["metadata"])
        if isinstance(data.get("metadata"), dict)
        else data["metadata"],
        "context": data.get("context"),
        "cited_text": data.get("cited_text"),
    })
def to_markdown_reference(self) -> str:
"""Generate markdown reference format: [Title](URL)"""